gehad5454 committed (verified) · Commit 57bc59f · 1 Parent(s): 7089770

Upload 23 files
counsel-chat-master/.gitignore ADDED
@@ -0,0 +1,4 @@
+ .DS_Store
+ __pycache__
+ .zip
+ .idea
counsel-chat-master/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2020 nbertagnolli
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
counsel-chat-master/Makefile ADDED
@@ -0,0 +1,31 @@
+ # Interact Variables
+ # defaults
+ CHECKPOINT_DIR = "counselchat-convai"
+ DATA = "data/counsel_chat_250-tokens_full.json"
+ TEMPERATURE = 0.7
+ MAX_LENGTH = 30
+ MAX_HISTORY = 0
+
+ # HELP
+ # This will output the help for each task
+ # thanks to https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
+ .PHONY: help
+
+ help: ## This help.
+ 	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
+
+ .DEFAULT_GOAL := help
+
+
+ # DOCKER TASKS
+ # Build the container
+ build:
+ 	docker system prune && \
+ 	docker build -t convai -f docker/Dockerfile .
+
+ ## Interact with the model
+ interact:
+ 	docker run -it --rm -v $(shell pwd):$(shell pwd) convai:latest \
+ 	/bin/sh -c 'cd $(shell pwd); \
+ 	python3 interact.py --model_checkpoint $(CHECKPOINT_DIR) --temperature $(TEMPERATURE) --max_length $(MAX_LENGTH) --max_history $(MAX_HISTORY) --dataset_path $(DATA); \
+ 	'
counsel-chat-master/README.md ADDED
@@ -0,0 +1,25 @@
+ # counsel-chat
+ This repository holds the code for working with data from counselchat.com. The scraped data come from individuals seeking assistance from licensed therapists, along with the therapists' responses. The goal is to provide a high-quality, open-source dataset of counseling responses.
+
+ I've recently added the data to [HuggingFace](https://huggingface.co/datasets/nbertagnolli/counsel-chat), so getting the data should be as easy as:
+
+ ```python
+ from datasets import load_dataset
+
+ dataset = load_dataset("nbertagnolli/counsel-chat")
+ ```
+
+ There is a larger writeup available on [Medium](https://medium.com/towards-data-science/counsel-chat-bootstrapping-high-quality-therapy-data-971b419f33da).
+
+ If you use this data in your work, please cite the Medium article.
+
+ ```
+ @misc{bertagnolli2020counsel,
+   title={Counsel chat: Bootstrapping high-quality therapy data},
+   author={Bertagnolli, Nicolas},
+   year={2020},
+   publisher={Towards Data Science. https://towardsdatascience.com/counsel-chat~…}
+ }
+ ```
+
+
counsel-chat-master/counsel_chat.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
counsel-chat-master/data/20200325_counsel_chat.csv ADDED
The diff for this file is too large to render. See raw diff
 
counsel-chat-master/data/counsel_chat_250-tokens_full.json ADDED
The diff for this file is too large to render. See raw diff
 
counsel-chat-master/data/counselchat-data.csv ADDED
The diff for this file is too large to render. See raw diff
 
counsel-chat-master/docker/Dockerfile ADDED
@@ -0,0 +1,37 @@
+ FROM ubuntu:18.04
+
+ MAINTAINER Loreto Parisi [email protected]
+
+ ######################################## BASE SYSTEM
+ # set noninteractive installation
+ ARG DEBIAN_FRONTEND=noninteractive
+ RUN apt-get update && apt-get install -y apt-utils
+ RUN apt-get install -y --no-install-recommends \
+     build-essential \
+     pkg-config \
+     tzdata \
+     curl
+
+ ######################################## PYTHON3
+ RUN apt-get install -y \
+     python3 \
+     python3-pip
+
+ # set local timezone
+ RUN ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
+     dpkg-reconfigure --frontend noninteractive tzdata
+
+ # transfer-learning-conv-ai
+ ENV PYTHONPATH /usr/local/lib/python3.6
+ COPY . .
+ COPY docker/requirements.txt /tmp/requirements.txt
+ RUN pip3 install --upgrade pip && pip3 install -r /tmp/requirements.txt
+
+ # model zoo
+ RUN mkdir models && \
+     curl https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/finetuned_chatbot_gpt.tar.gz > models/finetuned_chatbot_gpt.tar.gz && \
+     cd models/ && \
+     tar -xvzf finetuned_chatbot_gpt.tar.gz && \
+     rm finetuned_chatbot_gpt.tar.gz
+
+ CMD ["bash"]
counsel-chat-master/docker/requirements.txt ADDED
@@ -0,0 +1,10 @@
+ numpy
+ torch
+ pytorch-ignite
+ transformers==2.5.1
+ tensorboardX==1.8
+ jupyter
+ scikit-learn
+ pandas
+ dill
+ scipy
counsel-chat-master/interact.py ADDED
@@ -0,0 +1,258 @@
+ # # Copyright (c) 2019-present, HuggingFace Inc.
+ # All rights reserved.
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+ import logging
+ import json
+ import os
+ import random
+ from argparse import ArgumentParser
+ from itertools import chain
+ from pprint import pformat
+ import tempfile
+ import tarfile
+ import warnings
+
+ import torch
+ import torch.nn.functional as F
+ from transformers import cached_path
+
+ from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, \
+     GPT2Tokenizer
+
+ HF_FINETUNED_MODEL = "https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/gpt_personachat_cache.tar.gz"
+ SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
+ ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
+                          'additional_special_tokens': ['<speaker1>', '<speaker2>']}
+
+
+ def download_pretrained_model():
+     """ Download and extract finetuned model from S3 """
+     resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
+     tempdir = tempfile.mkdtemp()
+     with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+         def is_within_directory(directory, target):
+
+             abs_directory = os.path.abspath(directory)
+             abs_target = os.path.abspath(target)
+
+             prefix = os.path.commonprefix([abs_directory, abs_target])
+
+             return prefix == abs_directory
+
+         def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+
+             for member in tar.getmembers():
+                 member_path = os.path.join(path, member.name)
+                 if not is_within_directory(path, member_path):
+                     raise Exception("Attempted Path Traversal in Tar File")
+
+             tar.extractall(path, members, numeric_owner=numeric_owner)
+
+
+         safe_extract(archive, tempdir)
+     return tempdir
+
+
+ def get_dataset(tokenizer, dataset_path, dataset_cache):
+     """ Get tokenized PERSONACHAT dataset from S3 or cache."""
+     dataset_path = dataset_path
+     dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
+     if dataset_cache and os.path.isfile(dataset_cache):
+         dataset = torch.load(dataset_cache)
+     else:
+         personachat_file = cached_path(dataset_path)
+         with open(personachat_file, "r", encoding="utf-8") as f:
+             dataset = json.loads(f.read())
+
+         def tokenize(obj):
+             if isinstance(obj, str):
+                 return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+             if isinstance(obj, dict):
+                 return dict((n, tokenize(o)) for n, o in obj.items())
+             return list(tokenize(o) for o in obj)
+         dataset = tokenize(dataset)
+         torch.save(dataset, dataset_cache)
+     return dataset
+
+
+ def add_special_tokens_(model, tokenizer):
+     """ Add special tokens to the tokenizer and the model if they have not already been added. """
+     orig_num_tokens = len(tokenizer.encoder)
+     num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)  # doesn't add if they are already there
+     if num_added_tokens > 0:
+         model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)
+
+
+ def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
+     """ Build a sequence of input from 3 segments: persona, history and last reply. """
+     bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
+     sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])]
+     sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
+     instance = {}
+     instance["input_ids"] = list(chain(*sequence))
+     instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
+     instance["mc_token_ids"] = len(instance["input_ids"]) - 1
+     instance["lm_labels"] = [-100] * len(instance["input_ids"])
+     if lm_labels:
+         instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
+     return instance
+
+
+ def top_filtering(logits, top_k=0., top_p=0.9, threshold=-float('Inf'),
+                   filter_value=-float('Inf')):
+     """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
+     Args:
+         logits: logits distribution shape (vocabulary size)
+         top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
+         top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
+             whose total probability mass is greater than or equal to the threshold top_p.
+             In practice, we select the highest probability tokens whose cumulative probability mass exceeds
+             the threshold top_p.
+         threshold: a minimal threshold to keep logits
+     """
+     assert logits.dim() == 1  # Only work for batch size 1 for now - could update but it would obfuscate a bit the code
+     top_k = min(top_k, logits.size(-1))
+     if top_k > 0:
+         # Remove all tokens with a probability less than the last token in the top-k tokens
+         indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+         logits[indices_to_remove] = filter_value
+
+     if top_p > 0.0:
+         # Compute cumulative probabilities of sorted tokens
+         sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+         cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+         # Remove tokens with cumulative probability above the threshold
+         sorted_indices_to_remove = cumulative_probabilities > top_p
+         # Shift the indices to the right to keep also the first token above the threshold
+         sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+         sorted_indices_to_remove[..., 0] = 0
+
+         # Back to unsorted indices and set them to -infinity
+         indices_to_remove = sorted_indices[sorted_indices_to_remove]
+         logits[indices_to_remove] = filter_value
+
+     indices_to_remove = logits < threshold
+     logits[indices_to_remove] = filter_value
+
+     return logits
+
+
+ def sample_sequence(personality, history, tokenizer, model, args, current_output=None):
+     special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
+     if current_output is None:
+         current_output = []
+
+     for i in range(args.max_length):
+         instance = build_input_from_segments(personality, history, current_output,
+                                              tokenizer, with_eos=False)
+
+         input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0)
+         token_type_ids = torch.tensor(instance["token_type_ids"],
+                                       device=args.device).unsqueeze(0)
+
+         logits = model(input_ids, token_type_ids=token_type_ids)
+         if isinstance(logits, tuple):  # for gpt2 and maybe others
+             logits = logits[0]
+         logits = logits[0, -1, :] / args.temperature
+         logits = top_filtering(logits, top_k=args.top_k, top_p=args.top_p)
+         probs = F.softmax(logits, dim=-1)
+
+         prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1)
+         if i < args.min_length and prev.item() in special_tokens_ids:
+             while prev.item() in special_tokens_ids:
+                 if probs.max().item() == 1:
+                     warnings.warn(
+                         "Warning: model generating special token with probability 1.")
+                     break  # avoid infinitely looping over special token
+                 prev = torch.multinomial(probs, num_samples=1)
+
+         if prev.item() in special_tokens_ids:
+             break
+         current_output.append(prev.item())
+
+     return current_output
+
+
+ def run():
+     parser = ArgumentParser()
+     parser.add_argument("--dataset_path", type=str, default="",
+                         help="Path or url of the dataset. If empty download from S3.")
+     parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
+                         help="Path or url of the dataset cache")
+     parser.add_argument("--model", type=str, default="openai-gpt",
+                         help="Model type (openai-gpt or gpt2)", choices=['openai-gpt',
+                                                                          'gpt2'])  # anything besides gpt2 will load openai-gpt
+     parser.add_argument("--model_checkpoint", type=str, default="",
+                         help="Path, url or short name of the model")
+     parser.add_argument("--max_history", type=int, default=2,
+                         help="Number of previous utterances to keep in history")
+     parser.add_argument("--device", type=str,
+                         default="cuda" if torch.cuda.is_available() else "cpu",
+                         help="Device (cuda or cpu)")
+
+     parser.add_argument("--no_sample", action='store_true',
+                         help="Set to use greedy decoding instead of sampling")
+     parser.add_argument("--max_length", type=int, default=20,
+                         help="Maximum length of the output utterances")
+     parser.add_argument("--min_length", type=int, default=1,
+                         help="Minimum length of the output utterances")
+     parser.add_argument("--seed", type=int, default=0, help="Seed")
+     parser.add_argument("--temperature", type=float, default=0.7,
+                         help="Sampling softmax temperature")
+     parser.add_argument("--top_k", type=int, default=0,
+                         help="Filter top-k tokens before sampling (<=0: no filtering)")
+     parser.add_argument("--top_p", type=float, default=0.9,
+                         help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
+     args = parser.parse_args()
+
+     logging.basicConfig(level=logging.INFO)
+     logger = logging.getLogger(__file__)
+     logger.info(pformat(args))
+
+     if args.model_checkpoint == "":
+         if args.model == 'gpt2':
+             raise ValueError(
+                 "Interacting with GPT2 requires passing a finetuned model_checkpoint")
+         else:
+             args.model_checkpoint = download_pretrained_model()
+
+     if args.seed != 0:
+         random.seed(args.seed)
+         torch.random.manual_seed(args.seed)
+         torch.cuda.manual_seed(args.seed)
+
+     logger.info("Get pretrained model and tokenizer")
+     tokenizer_class, model_class = (
+         GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (
+         OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
+     tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
+     model = model_class.from_pretrained(args.model_checkpoint)
+     model.to(args.device)
+     add_special_tokens_(model, tokenizer)
+
+     logger.info("Sample a personality")
+     dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
+     personalities = [dialog["personality"] for dataset in dataset.values() for dialog in
+                      dataset]
+     personality = random.choice(personalities)
+     logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
+
+     history = []
+     while True:
+         raw_text = input(">>> ")
+         while not raw_text:
+             print('Prompt should not be empty!')
+             raw_text = input(">>> ")
+         history.append(tokenizer.encode(raw_text))
+         with torch.no_grad():
+             out_ids = sample_sequence(personality, history, tokenizer, model, args)
+         history.append(out_ids)
+         history = history[-(2 * args.max_history + 1):]
+         out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
+         print(out_text)
+
+
+ if __name__ == "__main__":
+     run()
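
For reference, here is a minimal sketch (not part of this upload) of what `top_filtering` does to a toy logit vector right before `sample_sequence` draws the next token. It assumes the repo root is on `PYTHONPATH` so `interact.py` is importable.

```python
# Hypothetical usage sketch: nucleus (top-p) filtering of fake next-token logits,
# mirroring the call made inside sample_sequence().
import torch
import torch.nn.functional as F

from interact import top_filtering  # assumes interact.py is on the path

logits = torch.tensor([3.0, 2.0, 1.0, 0.5, -1.0])            # toy next-token scores
filtered = top_filtering(logits.clone(), top_k=0, top_p=0.9)  # low-mass tail set to -inf
probs = F.softmax(filtered, dim=-1)                           # masked tokens get ~0 probability
next_token = torch.multinomial(probs, 1)                      # sample, as sample_sequence() does
print(filtered, next_token.item())
```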
counsel-chat-master/train.py ADDED
@@ -0,0 +1,267 @@
+ # Copyright (c) 2019-present, HuggingFace Inc.
+ # All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree.
+ import os
+ import math
+ import logging
+ from pprint import pformat
+ from argparse import ArgumentParser
+ from collections import defaultdict
+ from itertools import chain
+
+ import torch
+ from torch.nn.parallel import DistributedDataParallel
+ from torch.utils.data import DataLoader, TensorDataset
+ from ignite.engine import Engine, Events
+ from ignite.handlers import ModelCheckpoint
+ from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
+ from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
+ from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler
+ from transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+                           GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)
+
+ from utils import get_dataset, make_logdir
+
+ SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
+ ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
+                          'additional_special_tokens': ['<speaker1>', '<speaker2>']}
+ MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
+ PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]
+
+ logger = logging.getLogger(__file__)
+
+ def average_distributed_scalar(scalar, args):
+     """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """
+     if args.local_rank == -1:
+         return scalar
+     scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
+     torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
+     return scalar_t.item()
+
+
+ def pad_dataset(dataset, padding=0):
+     """ Pad the dataset. This could be optimized by defining a Dataset class and padding at the batch level, but this is simpler. """
+     max_l = max(len(x) for x in dataset["input_ids"])
+     for name in PADDED_INPUTS:
+         dataset[name] = [x + [padding if name != "lm_labels" else -100] * (max_l - len(x)) for x in dataset[name]]
+     return dataset
+
+
+ def add_special_tokens_(model, tokenizer):
+     """ Add special tokens to the tokenizer and the model if they have not already been added. """
+     orig_num_tokens = len(tokenizer.encoder)
+     num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)  # doesn't add if they are already there
+     if num_added_tokens > 0:
+         model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)
+
+ def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
+     """ Build a sequence of input from 3 segments: persona, history and last reply. """
+     bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
+     sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])]
+     sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
+     instance = {}
+     instance["input_ids"] = list(chain(*sequence))
+     instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
+     instance["mc_token_ids"] = len(instance["input_ids"]) - 1
+     instance["lm_labels"] = [-100] * len(instance["input_ids"])
+     if lm_labels:
+         instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
+     return instance
+
+
+ def get_data_loaders(args, tokenizer):
+     """ Prepare the dataset for training and evaluation """
+     personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
+
+     logger.info("Build inputs and labels")
+     datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
+     for dataset_name, dataset in personachat.items():
+         num_candidates = len(dataset[0]["utterances"][0]["candidates"])
+         if args.num_candidates > 0 and dataset_name == 'train':
+             num_candidates = min(args.num_candidates, num_candidates)
+         for dialog in dataset:
+             persona = dialog["personality"].copy()
+             for _ in range(args.personality_permutations):
+                 for utterance in dialog["utterances"]:
+                     history = utterance["history"][-(2*args.max_history+1):]
+                     for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
+                         lm_labels = bool(j == num_candidates-1)
+                         instance = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels)
+                         for input_name, input_array in instance.items():
+                             datasets[dataset_name][input_name].append(input_array)
+                     datasets[dataset_name]["mc_labels"].append(num_candidates - 1)
+                 datasets[dataset_name]["n_candidates"] = num_candidates
+                 persona = [persona[-1]] + persona[:-1]  # permuted personalities
+
+     logger.info("Pad inputs and convert to Tensor")
+     tensor_datasets = {"train": [], "valid": []}
+     for dataset_name, dataset in datasets.items():
+         dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
+         for input_name in MODEL_INPUTS:
+             tensor = torch.tensor(dataset[input_name])
+             if input_name != "mc_labels":
+                 tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
+             tensor_datasets[dataset_name].append(tensor)
+
+     logger.info("Build train and validation dataloaders")
+     train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
+     train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
+     valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
+     train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, shuffle=(not args.distributed))
+     valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False)
+
+     logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
+     logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
+     return train_loader, valid_loader, train_sampler, valid_sampler
+
+
+ def train():
+     parser = ArgumentParser()
+     parser.add_argument("--dataset_path", type=str, default="/Users/tetracycline/repos/datascience/datascience/projects/counsel_chat_all_data_300-tokens.json", help="Path or url of the dataset. If empty download from S3.")
+     parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
+     parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model")
+     parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
+     parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
+     parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
+     parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
+     parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
+     parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
+     parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
+     parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
+     parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
+     parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
+     parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
+     parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
+     parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
+     parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
+     parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
+     args = parser.parse_args()
+
+     # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
+     logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+     logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
+     logger.info("Arguments: %s", pformat(args))
+
+     # Initialize distributed training if needed
+     args.distributed = (args.local_rank != -1)
+     if args.distributed:
+         torch.cuda.set_device(args.local_rank)
+         args.device = torch.device("cuda", args.local_rank)
+         torch.distributed.init_process_group(backend='nccl', init_method='env://')
+
+     logger.info("Prepare tokenizer, pretrained model and optimizer.")
+     tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # cant use Autotokenizer because checkpoint could be a Path
+     tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
+
+
+     model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
+     model = model_class.from_pretrained(args.model_checkpoint)
+     model.to(args.device)
+     # Add special tokens if they are not already added
+     add_special_tokens_(model, tokenizer)
+     optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)
+
+     # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
+     if args.fp16:
+         from apex import amp  # Apex is only required if we use fp16 training
+         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
+     if args.distributed:
+         model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
+
+     logger.info("Prepare datasets")
+     train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)
+
+     # Training function and trainer
+     def update(engine, batch):
+         model.train()
+         batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
+         input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
+         (lm_loss), (mc_loss), *_ = model(
+             input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
+             mc_labels=mc_labels, lm_labels=lm_labels
+         )
+         loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
+         if args.fp16:
+             with amp.scale_loss(loss, optimizer) as scaled_loss:
+                 scaled_loss.backward()
+             torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
+         else:
+             loss.backward()
+             torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
+         if engine.state.iteration % args.gradient_accumulation_steps == 0:
+             optimizer.step()
+             optimizer.zero_grad()
+         return loss.item()
+     trainer = Engine(update)
+
+     # Evaluation function and evaluator (evaluator output is the input of the metrics)
+     def inference(engine, batch):
+         model.eval()
+         with torch.no_grad():
+             batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
+             input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
+             logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
+             # if we dont send labels to model, it doesnt return losses
+             lm_logits, mc_logits, *_ = model(
+                 input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
+             )
+             lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
+             lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
+             return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
+     evaluator = Engine(inference)
+
+     # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
+     trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
+     if args.n_epochs < 1:
+         trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
+     if args.eval_before_start:
+         trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))
+
+     # Make sure distributed data samplers split the dataset nicely between the distributed processes
+     if args.distributed:
+         trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
+         evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))
+
+     # Linearly decrease the learning rate from lr to zero
+     scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
+     trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
+
+     # Prepare metrics - note how we compute distributed metrics
+     RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
+     metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
+                "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
+     metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
+                     "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
+     metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
+     for name, metric in metrics.items():
+         metric.attach(evaluator, name)
+
+     # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
+     if args.local_rank in [-1, 0]:
+         pbar = ProgressBar(persist=True)
+         pbar.attach(trainer, metric_names=["loss"])
+         evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))
+
+         log_dir = make_logdir(args.model_checkpoint)
+         tb_logger = TensorboardLogger(log_dir)
+
+         tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
+         tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
+         tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)
+
+         checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
+         trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation
+
+         torch.save(args, log_dir + '/model_training_args.bin')
+         getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
+         tokenizer.save_pretrained(log_dir)
+
+     # Run the training
+     trainer.run(train_loader, max_epochs=args.n_epochs)
+
+     # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
+     if args.local_rank in [-1, 0] and args.n_epochs > 0:
+         os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
+         tb_logger.close()
+
+ if __name__ == "__main__":
+     train()
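
As a quick illustration of the sequence layout that train.py feeds the double-heads model, here is a hedged sketch (not part of this upload). `build_input_from_segments` and the special-token map are duplicated in `interact.py`, which has lighter imports than `utils.py`, so the sketch pulls them from there; it also assumes network access to download the `openai-gpt` tokenizer.

```python
# Hypothetical sketch: inspect how persona / history / reply are packed into one
# token sequence with <bos>, <speaker1>/<speaker2> and <eos> markers.
from transformers import OpenAIGPTTokenizer

from interact import ATTR_TO_SPECIAL_TOKEN, build_input_from_segments

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)

persona = [tokenizer.encode("i am a licensed therapist .")]    # persona sentences as token ids
history = [tokenizer.encode("i feel anxious all the time .")]  # alternating dialogue turns
reply = tokenizer.encode("that sounds exhausting . when did it start ?")

instance = build_input_from_segments(persona, history, reply, tokenizer, lm_labels=True)
print(tokenizer.decode(instance["input_ids"]))  # <bos> persona, speaker-tagged turns, reply, <eos>
print(instance["lm_labels"])                    # -100 everywhere except the reply tokens
```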
counsel-chat-master/utils.py ADDED
@@ -0,0 +1,387 @@
+ from typing import Dict, Any, Callable, List, Tuple, Optional, Union
+ import torch
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn import metrics
+ from sklearn.utils.multiclass import unique_labels
+ from sklearn.base import BaseEstimator, TransformerMixin
+ import re
+ import logging
+ import json
+ from datetime import datetime
+ import logging
+ import os
+ import tarfile
+ import tempfile
+ import socket
+
+ import torch
+
+ from transformers import cached_path
+
+ PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
+ HF_FINETUNED_MODEL = "https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/gpt_personachat_cache.tar.gz"
+
+ logger = logging.getLogger(__file__)
+
+ def download_pretrained_model():
+     """ Download and extract finetuned model from S3 """
+     resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
+     tempdir = tempfile.mkdtemp()
+     logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
+     with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+         def is_within_directory(directory, target):
+
+             abs_directory = os.path.abspath(directory)
+             abs_target = os.path.abspath(target)
+
+             prefix = os.path.commonprefix([abs_directory, abs_target])
+
+             return prefix == abs_directory
+
+         def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+
+             for member in tar.getmembers():
+                 member_path = os.path.join(path, member.name)
+                 if not is_within_directory(path, member_path):
+                     raise Exception("Attempted Path Traversal in Tar File")
+
+             tar.extractall(path, members, numeric_owner=numeric_owner)
+
+
+         safe_extract(archive, tempdir)
+     return tempdir
+
+
+ def get_dataset(tokenizer, dataset_path, dataset_cache):
+     """ Get tokenized PERSONACHAT dataset from S3 or cache."""
+     dataset_path = dataset_path or PERSONACHAT_URL
+     dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
+     if dataset_cache and os.path.isfile(dataset_cache):
+         logger.info("Load tokenized dataset from cache at %s", dataset_cache)
+         dataset = torch.load(dataset_cache)
+     else:
+         logger.info("Download dataset from %s", dataset_path)
+         personachat_file = cached_path(dataset_path)
+         with open(personachat_file, "r", encoding="utf-8") as f:
+             dataset = json.loads(f.read())
+
+         logger.info("Tokenize and encode the dataset")
+         def tokenize(obj):
+             if isinstance(obj, str):
+                 return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+             if isinstance(obj, dict):
+                 return dict((n, tokenize(o)) for n, o in obj.items())
+             return list(tokenize(o) for o in obj)
+         dataset = tokenize(dataset)
+         torch.save(dataset, dataset_cache)
+     return dataset
+
+
+ class AttrDict(dict):
+     def __init__(self, *args, **kwargs):
+         super(AttrDict, self).__init__(*args, **kwargs)
+         self.__dict__ = self
+
+
+ def make_logdir(model_name: str):
+     """Create unique path to save results and checkpoints, e.g. runs/Sep22_19-45-59_gpu-7_gpt2"""
+     # Code copied from ignite repo
+     current_time = datetime.now().strftime('%b%d_%H-%M-%S')
+     logdir = os.path.join(
+         'runs', current_time + '_' + socket.gethostname() + '_' + model_name)
+     return logdir
+
+
+ def calculate_classification_metrics(
+     y_true: np.array,
+     y_pred: np.array,
+     average: Optional[str] = None,
+     return_df: bool = True,
+ ) -> Union[Dict[str, float], pd.DataFrame]:
+     """Computes f1, precision, recall, kappa, accuracy, and support
+
+     Args:
+         y_true: The true labels
+         y_pred: The predicted labels
+         average: How to average multiclass results
+         return_df: Returns a dataframe if true otherwise a dictionary of performance
+             values.
+
+     Returns:
+         Either a dataframe of the performance metrics or a single dictionary
+     """
+     labels = unique_labels(y_true, y_pred)
+
+     # get results
+     precision, recall, f_score, support = metrics.precision_recall_fscore_support(
+         y_true, y_pred, labels=labels, average=average
+     )
+
+     kappa = metrics.cohen_kappa_score(y_true, y_pred, labels=labels)
+     accuracy = metrics.accuracy_score(y_true, y_pred)
+
+     # create a pandas DataFrame
+     if return_df:
+         results = pd.DataFrame(
+             {
+                 "class": labels,
+                 "f_score": f_score,
+                 "precision": precision,
+                 "recall": recall,
+                 "support": support,
+                 "kappa": kappa,
+                 "accuracy": accuracy,
+             }
+         )
+     else:
+         results = {
+             "f1": f_score,
+             "precision": precision,
+             "recall": recall,
+             "kappa": kappa,
+             "accuracy": accuracy,
+         }
+
+     return results
+
+
+ def visualize_performance(
+     df: pd.DataFrame,
+     metrics: List[str],
+     ax: Optional[Any] = None,
+     title: Optional[str] = None,
+     ylim: Optional[Tuple[float, float]] = None,
+     figsize: Optional[Tuple[int, int]] = None,
+     use_class_names: bool = True
+ ) -> None:
+     """Takes a Performance DF and converts it to a bar plot performance graph
+
+     Args:
+         df: A dataframe where each row is a class and each column is a metric
+         metrics: A list of metrics from the columns of df to plot
+         ax: A matplotlib axes object that we want to draw the plot on
+         title: The title of the plot
+         ylim: The minimum and maximum range for the yaxis.
+         figsize: The width and height of the figure. This does nothing if ax is set
+         use_class_names: This will label the x ticks with the class name in a multiclass setting.
+     """
+     unstacked_df = (
+         df[metrics]
+         .T.unstack()
+         .reset_index()
+         .rename(
+             index=str, columns={"level_0": "class", "level_1": "metric", 0: "score"}
+         )
+     )
+
+     if use_class_names:
+         unstacked_df["class"] = unstacked_df["class"].apply(
+             lambda x: df["class"].tolist()[x]
+         )
+
+     if figsize is None:
+         figsize = (10, 7)
+
+     # Display the graph
+     if ax is None:
+         fig, ax = plt.subplots(1, 1, figsize=(10, 7))
+
+     sns.barplot(x="class", y="score", hue="metric", data=unstacked_df, ax=ax)
+
+     # Format the graph
+     ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
+     if title is not None:
+         ax.set_title(title, fontsize=20)
+
+     if ylim is not None:
+         ax.set_ylim(ylim)
+
+     plt.tight_layout()
+
+
+ class BertTransformer(BaseEstimator, TransformerMixin):
+     """See https://towardsdatascience.com/build-a-bert-sci-kit-transformer-59d60ddd54a5#d608"""
+     def __init__(
+         self,
+         bert_tokenizer,
+         bert_model,
+         max_length: int = 60,
+         embedding_func: Optional[Callable[[Tuple[torch.tensor]], torch.tensor]] = None,
+     ):
+         self.tokenizer = bert_tokenizer
+         self.model = bert_model
+         self.model.eval()
+         self.max_length = max_length
+         self.embedding_func = embedding_func
+
+         if self.embedding_func is None:
+             self.embedding_func = lambda x: x[0][:, 0, :]
+
+     # TODO:: PADDING
+
+     def _tokenize(self, text: str):
+         tokenized_text = self.tokenizer.encode_plus(
+             text, add_special_tokens=True, max_length=self.max_length
+         )["input_ids"]
+         attention_mask = [1] * len(tokenized_text)
+
+         # bert takes in a batch so we need to unsqueeze the rows
+         return (
+             torch.tensor(tokenized_text).unsqueeze(0),
+             torch.tensor(attention_mask).unsqueeze(0),
+         )
+
+     def _tokenize_and_predict(self, text: str):
+         tokenized, attention_mask = self._tokenize(text)
+
+         embeddings = self.model(tokenized, attention_mask)
+         return self.embedding_func(embeddings)
+
+     def transform(self, text: List[str]):
+         if isinstance(text, pd.Series):
+             text = text.tolist()
+
+         with torch.no_grad():
+             return torch.stack([self._tokenize_and_predict(string) for string in text])
+
+     def fit(self, X, y=None):
+         """No fitting necessary so we just return ourselves"""
+         return self
+
+
+ def convert_df_to_conv_ai_dict(df: pd.DataFrame,
+                                personality: List[str],
+                                response_columns: List[str],
+                                tokenizer: Callable[[str], List[str]],
+                                max_tokens: Optional[int] = None,
+                                n_candidates: int = 6
+                                ) -> Dict[str, List[Any]]:
+     """
+     Each entry in personachat is a dict with two keys personality and utterances, the dataset is a list of entries.
+     personality: list of strings containing the personality of the agent
+     utterances: list of dictionaries, each of which has two keys which are lists of strings.
+         candidates: [next_utterance_candidate_1, ..., next_utterance_candidate_19]
+             The last candidate is the ground truth response observed in the conversational data
+         history: [dialog_turn_0, ... dialog_turn N], where N is an odd number since the other user starts every conversation.
+     Preprocessing:
+         - Spaces before periods at end of sentences
+         - everything lowercase
+
+     Process each row of a DataFrame. For each row:
+     1. Grab the conversational input text
+     2. Grab all the responses
+     3. Create a unique data entry for each response to the question.
+     4. Sample random response sentences from the dataset.
+     5. Combine the random responses into a candidate list.
+
+     Args:
+         df: The counsel chat pandas dataframe
+         personality: The personality we would like to use during training
+         response_columns: Columns which contain valid responses to the question. For example,
+             the answerText column is the complete response of the therapist
+         tokenizer: The transformers library tokenizer associated with the model we will be
+             training. It is used for setting the maximum sequence length
+         max_tokens: The maximum number of tokens that any candidate, response, or question should be.
+         n_candidates: The number of candidate phrases to include in the dataset for training.
+             The last member of candidates is the ground truth response
+
+     Returns:
+         A dictionary with a train and validation key.
+     """
+     # Add one because the index of the dataframe is the 0th position.
+     tuple_map = {name: index + 1 for index, name in enumerate(df.columns.tolist())}
+
+     train = []
+     val = []
+     # Step through every row in the dictionary
+     for row in df.itertuples():
+
+         # Get the question name and title
+         # TODO:: MAKE THIS GENERAL YOU DUMB DUMB
+         question_title = row[tuple_map["questionTitle"]]
+         question_text = row[tuple_map["questionText"]]
+         question_combined = question_title + " " + question_text
+
+         # Step through every response column in the row
+         for response_column in response_columns:
+
+             # Get the true response
+             true_response = row[tuple_map[response_column]]
+
+             # We only want to add data if a good response exists
+             if len(true_response) > 1:
+                 # Get candidate alternate sentences by sampling from all other questions
+                 candidates = sample_candidates(df, row[tuple_map["questionID"]], "questionID", "answerText",
+                                                n_candidates)
+
+                 # Add the correct response to the end
+                 candidates.append(true_response)
+
+                 # We want to trim the size of the tokens
+                 if max_tokens is not None:
+                     # Use the provided tokenizer to tokenize the input and truncate at max_tokens
+                     question_combined = tokenizer.convert_tokens_to_string(
+                         tokenizer.tokenize(question_combined)[:max_tokens])
+                     candidates = [tokenizer.convert_tokens_to_string(tokenizer.tokenize(candidate)[:max_tokens]) for
+                                   candidate in candidates]
+
+                 if len(candidates) != n_candidates + 1:
+                     print(true_response)
+                     assert False
+
+                 # Define the personality and the history
+                 d = {"personality": personality,
+                      "utterances": [{"history": [question_combined],
+                                      "candidates": candidates}]}
+                 if getattr(row, "split") == "train":
+                     train.append(d)
+                 elif getattr(row, "split") == "val":
+                     val.append(d)
+
+     data = {"train": train, "valid": val}
+
+     return data
+
+
+ def sample_candidates(df: pd.DataFrame, current_id: Any, id_column: str, text_column: str, n: int) -> List[str]:
+     """Samples candidate responses to a question from the dataframe
+
+     It is aware of data splits and only samples from within the same split. This avoids
+     leaking information between training validation and testing. The sampled responses are
+     also drawn from all rows which do not have the same id as the current_id
+
+     Args:
+         df: The dataframe we want to sample responses from
+         current_id: The unique identifier we would like to leave out of our sampling
+         id_column: The column name in the dataframe with the unique ids. current_id should
+             be an element of this column
+         text_column: The column with the text we want to sample
+         n: How many samples we want to take.
+
+     Returns:
+         A list of sampled strings from our dataframe.
+     """
+     # We must only sample candidates from the correct data split to avoid information leakage across channels
+     split = df[df[id_column] == current_id]["split"].tolist()[0]
+     candidate_df = df[df["split"] == split]
+
+     # Sample random rows from the dataframe not matching the current id
+     sampled_texts = candidate_df[candidate_df[id_column] != current_id].sample(n + 15)[text_column].tolist()
+
+     # join them all
+     text = " ".join(sampled_texts)
+
+     # Replace all newlines with spaces...
+     text_no_newline = re.sub("\n", " ", text).lower()
+
+     # Split on punctuation
+     split_text = re.split('[?.!]', text_no_newline)
+
+     # Remove all empty lines
+     filtered_text = [x.strip() for x in split_text if len(x.strip()) > 1]
+
+     # Shuffle the list
+     return np.random.choice(filtered_text, n).tolist()
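
For context, here is a minimal sketch (not part of this upload) of how `convert_df_to_conv_ai_dict` can turn the bundled CSV into a ConvAI-style JSON like `data/counsel_chat_250-tokens_full.json`. It assumes the repo root is on `PYTHONPATH` and that `matplotlib`/`seaborn` are installed, since `utils.py` imports them even though `docker/requirements.txt` does not pin them; the 250-token cap is only inferred from the bundled file name.

```python
# Hypothetical sketch: build a ConvAI-style train/valid dict from the scraped CSV.
import json

import pandas as pd
from transformers import OpenAIGPTTokenizer

from utils import convert_df_to_conv_ai_dict

df = pd.read_csv("data/20200325_counsel_chat.csv")
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

data = convert_df_to_conv_ai_dict(
    df,
    personality=[""],                 # empty persona, as in utils_test.py
    response_columns=["answerText"],  # the therapist's full answer
    tokenizer=tokenizer,
    max_tokens=250,                   # assumed from the bundled file name
    n_candidates=6,
)

with open("counsel_chat_250-tokens.json", "w") as f:
    json.dump(data, f)
```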
counsel-chat-master/utils_test.py ADDED
@@ -0,0 +1,50 @@
+ from typing import List
+ import unittest
+
+ import ddt
+ from ddt import unpack, data
+ import numpy as np
+ import pandas as pd
+ from transformers import OpenAIGPTTokenizer
+
+ from utils import sample_candidates, convert_df_to_conv_ai_dict
+
+
+ # fmt: off
+ class UtilsTest(unittest.TestCase):
+
+     def test_sample_candidates(self):
+         # Test that the held-out question's answer doesn't show up in the samples.
+         # Create a small synthetic DF (the original fixture was left unfinished).
+         rows = [[i, "answer %d ." % i, "train"] for i in range(40)]
+         df = pd.DataFrame(rows, columns=["questionID", "answerText", "split"])
+         n_candidates = 3
+
+         for i in range(5):
+             candidates = sample_candidates(df, 0, "questionID", "answerText", n_candidates)
+             # Check that the samples don't come from the true data
+             self.assertEqual(len(candidates), n_candidates)
+             self.assertNotIn("answer 0", candidates)
+
+     def test_fuzz_convert_df_to_conv_ai_dict(self):
+         df = pd.read_csv("data/20200325_counsel_chat.csv")
+         df = df[df["split"] == "train"]
+         tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
+         for i in range(5):
+             temp_df = df.sample(100)
+             max_tokens = np.random.randint(1, 200)
+             n_candidates = np.random.randint(1, 10)
+             d = convert_df_to_conv_ai_dict(temp_df,
+                                            [""],
+                                            ["answerText"],
+                                            tokenizer,
+                                            max_tokens=max_tokens,
+                                            n_candidates=n_candidates)
+
+             # Test max length
+             self.assertLessEqual(max([len(x["utterances"][0]["history"][0].split()) for x in d["train"]]), max_tokens)
+
+             # Test n_candidates is equal to the number in the candidates list plus the one true response.
+             train_lengths = [len(x["utterances"][0]["candidates"]) for x in d["train"]]
+             self.assertEqual(n_candidates + 1, max(train_lengths))
+             self.assertEqual(n_candidates + 1, min(train_lengths))
+
+
+ if __name__ == "__main__":
+     unittest.main()
counselchat-convai/added_tokens.json ADDED
@@ -0,0 +1 @@
+ {"<bos>": 40478, "<eos>": 40479, "<pad>": 40480, "<speaker1>": 40481, "<speaker2>": 40482}
counselchat-convai/config.json ADDED
@@ -0,0 +1,54 @@
+ {
+   "afn": "gelu",
+   "architectures": [
+     "OpenAIGPTLMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": null,
+   "do_sample": false,
+   "embd_pdrop": 0.1,
+   "eos_token_ids": null,
+   "finetuning_task": null,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1"
+   },
+   "initializer_range": 0.02,
+   "is_decoder": false,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1
+   },
+   "layer_norm_epsilon": 1e-05,
+   "length_penalty": 1.0,
+   "max_length": 20,
+   "model_type": "openai-gpt",
+   "n_ctx": 512,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_layer": 12,
+   "n_positions": 512,
+   "n_special": 0,
+   "num_beams": 1,
+   "num_labels": 1,
+   "num_return_sequences": 1,
+   "output_attentions": false,
+   "output_hidden_states": false,
+   "output_past": true,
+   "pad_token_id": null,
+   "predict_special_tokens": true,
+   "pruned_heads": {},
+   "repetition_penalty": 1.0,
+   "resid_pdrop": 0.1,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "temperature": 1.0,
+   "top_k": 50,
+   "top_p": 1.0,
+   "torchscript": false,
+   "use_bfloat16": false,
+   "vocab_size": 40483
+ }
counselchat-convai/events.out.tfevents.1586138049.3b21657f0ca9 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2c720c161121fb6a7457b091cdfb68090b7a55ab9dc7090a4d93a9791077c5b
+ size 140048
counselchat-convai/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
counselchat-convai/model_training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65010fa713a71eac503cd4522be309b75f2874212050c38a780c5c7d7b73dfa9
+ size 729
counselchat-convai/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd9ab879baa4977df5caba66f5b923729f5618aee99b7aa3f5fe244a9e59150d
+ size 478771216
counselchat-convai/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<bos>", "eos_token": "<eos>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<speaker1>", "<speaker2>"]}
counselchat-convai/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"max_len": 512}
counselchat-convai/vocab.json ADDED
The diff for this file is too large to render. See raw diff
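
Taken together, the `counselchat-convai/` files form a standard transformers checkpoint, so it can also be loaded directly, mirroring what interact.py does with `--model_checkpoint counselchat-convai`. A hedged sketch follows; it assumes transformers==2.5.1 as pinned in `docker/requirements.txt` and that the directory has been downloaded locally.

```python
# Hypothetical sketch: load the uploaded OpenAI-GPT checkpoint and score a prompt.
import torch
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained("counselchat-convai")
model = OpenAIGPTLMHeadModel.from_pretrained("counselchat-convai")
model.eval()

ids = tokenizer.encode("i have trouble sleeping and feel anxious all the time .")
with torch.no_grad():
    logits = model(torch.tensor([ids]))[0]  # (1, seq_len, vocab_size), vocab_size == 40483
print(logits.shape)
```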