Upload 23 files
- counsel-chat-master/.gitignore +4 -0
- counsel-chat-master/LICENSE +21 -0
- counsel-chat-master/Makefile +31 -0
- counsel-chat-master/README.md +25 -0
- counsel-chat-master/counsel_chat.ipynb +0 -0
- counsel-chat-master/data/20200325_counsel_chat.csv +0 -0
- counsel-chat-master/data/counsel_chat_250-tokens_full.json +0 -0
- counsel-chat-master/data/counselchat-data.csv +0 -0
- counsel-chat-master/docker/Dockerfile +37 -0
- counsel-chat-master/docker/requirements.txt +10 -0
- counsel-chat-master/interact.py +258 -0
- counsel-chat-master/train.py +267 -0
- counsel-chat-master/utils.py +387 -0
- counsel-chat-master/utils_test.py +50 -0
- counselchat-convai/added_tokens.json +1 -0
- counselchat-convai/config.json +54 -0
- counselchat-convai/events.out.tfevents.1586138049.3b21657f0ca9 +3 -0
- counselchat-convai/merges.txt +0 -0
- counselchat-convai/model_training_args.bin +3 -0
- counselchat-convai/pytorch_model.bin +3 -0
- counselchat-convai/special_tokens_map.json +1 -0
- counselchat-convai/tokenizer_config.json +1 -0
- counselchat-convai/vocab.json +0 -0
counsel-chat-master/.gitignore
ADDED
@@ -0,0 +1,4 @@
.DS_Store
__pycache__
.zip
.idea
counsel-chat-master/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 nbertagnolli

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
counsel-chat-master/Makefile
ADDED
@@ -0,0 +1,31 @@
# Interact Variables
# defaults
CHECKPOINT_DIR = "counselchat-convai"
DATA = "data/counsel_chat_250-tokens_full.json"
TEMPERATURE = 0.7
MAX_LENGTH = 30
MAX_HISTORY = 0

# HELP
# This will output the help for each task
# thanks to https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
.PHONY: help

help: ## This help.
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

.DEFAULT_GOAL := help


# DOCKER TASKS
# Build the container
build:
	docker system prune && \
	docker build -t convai -f docker/Dockerfile .

## Interact with the model
interact:
	docker run -it --rm -v $(shell pwd):$(shell pwd) convai:latest \
	/bin/sh -c 'cd $(shell pwd); \
	python3 interact.py --model_checkpoint $(CHECKPOINT_DIR) --temperature $(TEMPERATURE) --max_length $(MAX_LENGTH) --max_history $(MAX_HISTORY) --dataset_path $(DATA); \
	'
counsel-chat-master/README.md
ADDED
@@ -0,0 +1,25 @@
# counsel-chat
This repository holds the code for working with data from counselchat.com. The scraped data come from individuals seeking assistance from licensed therapists, together with the therapists' responses. The goal is to provide a high-quality, open-source dataset of counseling responses.

I've recently added the data to [HuggingFace](https://huggingface.co/datasets/nbertagnolli/counsel-chat) so getting the data should be as easy as:

```python
from datasets import load_dataset

dataset = load_dataset("nbertagnolli/counsel-chat")
```

There is a larger writeup available on [medium](https://medium.com/towards-data-science/counsel-chat-bootstrapping-high-quality-therapy-data-971b419f33da).

If you use this data in your work, please cite the medium article.

```
@misc{bertagnolli2020counsel,
  title={Counsel chat: Bootstrapping high-quality therapy data},
  author={Bertagnolli, Nicolas},
  year={2020},
  publisher={Towards Data Science. https://towardsdatascience.com/counsel-chat…}
}
```
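For a quick look at the data after loading it from HuggingFace, the split can be pulled into pandas. This is a minimal sketch; the "train" split name and the questionText/answerText column names are assumed to match the CSV shipped in this repo and may differ in the hosted dataset.

```python
from datasets import load_dataset

# Minimal sketch of inspecting the data loaded from HuggingFace. The "train"
# split and the questionText/answerText columns are assumptions borrowed from
# the CSV in data/; adjust if the hosted dataset uses different fields.
dataset = load_dataset("nbertagnolli/counsel-chat")
df = dataset["train"].to_pandas()
print(df.columns.tolist())
print(df[["questionText", "answerText"]].head())
```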
counsel-chat-master/counsel_chat.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
counsel-chat-master/data/20200325_counsel_chat.csv
ADDED
The diff for this file is too large to render.
See raw diff
counsel-chat-master/data/counsel_chat_250-tokens_full.json
ADDED
The diff for this file is too large to render.
See raw diff
counsel-chat-master/data/counselchat-data.csv
ADDED
The diff for this file is too large to render.
See raw diff
counsel-chat-master/docker/Dockerfile
ADDED
@@ -0,0 +1,37 @@
FROM ubuntu:18.04

MAINTAINER Loreto Parisi [email protected]

######################################## BASE SYSTEM
# set noninteractive installation
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y apt-utils
RUN apt-get install -y --no-install-recommends \
    build-essential \
    pkg-config \
    tzdata \
    curl

######################################## PYTHON3
RUN apt-get install -y \
    python3 \
    python3-pip

# set local timezone
RUN ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
    dpkg-reconfigure --frontend noninteractive tzdata

# transfer-learning-conv-ai
ENV PYTHONPATH /usr/local/lib/python3.6
COPY . .
COPY docker/requirements.txt /tmp/requirements.txt
RUN pip3 install --upgrade pip && pip3 install -r /tmp/requirements.txt

# model zoo
RUN mkdir models && \
    curl https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/finetuned_chatbot_gpt.tar.gz > models/finetuned_chatbot_gpt.tar.gz && \
    cd models/ && \
    tar -xvzf finetuned_chatbot_gpt.tar.gz && \
    rm finetuned_chatbot_gpt.tar.gz

CMD ["bash"]
counsel-chat-master/docker/requirements.txt
ADDED
@@ -0,0 +1,10 @@
numpy
torch
pytorch-ignite
transformers==2.5.1
tensorboardX==1.8
jupyter
scikit-learn
pandas
dill
scipy
counsel-chat-master/interact.py
ADDED
@@ -0,0 +1,258 @@
# # Copyright (c) 2019-present, HuggingFace Inc.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import logging
import json
import os
import random
from argparse import ArgumentParser
from itertools import chain
from pprint import pformat
import tempfile
import tarfile
import warnings

import torch
import torch.nn.functional as F
from transformers import cached_path

from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, \
    GPT2Tokenizer

HF_FINETUNED_MODEL = "https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/gpt_personachat_cache.tar.gz"
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<speaker1>', '<speaker2>']}


def download_pretrained_model():
    """ Download and extract finetuned model from S3 """
    resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
    tempdir = tempfile.mkdtemp()
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        def is_within_directory(directory, target):

            abs_directory = os.path.abspath(directory)
            abs_target = os.path.abspath(target)

            prefix = os.path.commonprefix([abs_directory, abs_target])

            return prefix == abs_directory

        def safe_extract(tar, path=".", members=None, *, numeric_owner=False):

            for member in tar.getmembers():
                member_path = os.path.join(path, member.name)
                if not is_within_directory(path, member_path):
                    raise Exception("Attempted Path Traversal in Tar File")

            tar.extractall(path, members, numeric_owner=numeric_owner)

        safe_extract(archive, tempdir)
    return tempdir


def get_dataset(tokenizer, dataset_path, dataset_cache):
    """ Get tokenized PERSONACHAT dataset from S3 or cache."""
    dataset_path = dataset_path
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
    if dataset_cache and os.path.isfile(dataset_cache):
        dataset = torch.load(dataset_cache)
    else:
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)
        dataset = tokenize(dataset)
        torch.save(dataset, dataset_cache)
    return dataset


def add_special_tokens_(model, tokenizer):
    """ Add special tokens to the tokenizer and the model if they have not already been added. """
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)  # doesn't add if they are already there
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)


def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
    """ Build a sequence of input from 3 segments: persona, history and last reply. """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])]
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
    instance = {}
    instance["input_ids"] = list(chain(*sequence))
    instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    instance["lm_labels"] = [-100] * len(instance["input_ids"])
    if lm_labels:
        instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
    return instance


def top_filtering(logits, top_k=0., top_p=0.9, threshold=-float('Inf'),
                  filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
    Args:
        logits: logits distribution shape (vocabulary size)
        top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
        top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
            whose total probability mass is greater than or equal to the threshold top_p.
            In practice, we select the highest probability tokens whose cumulative probability mass exceeds
            the threshold top_p.
        threshold: a minimal threshold to keep logits
    """
    assert logits.dim() == 1  # Only work for batch size 1 for now - could update but it would obfuscate a bit the code
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # Remove all tokens with a probability less than the last token in the top-k tokens
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # Compute cumulative probabilities of sorted tokens
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probabilities > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Back to unsorted indices and set them to -infinity
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value

    indices_to_remove = logits < threshold
    logits[indices_to_remove] = filter_value

    return logits


def sample_sequence(personality, history, tokenizer, model, args, current_output=None):
    special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
    if current_output is None:
        current_output = []

    for i in range(args.max_length):
        instance = build_input_from_segments(personality, history, current_output,
                                             tokenizer, with_eos=False)

        input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0)
        token_type_ids = torch.tensor(instance["token_type_ids"],
                                      device=args.device).unsqueeze(0)

        logits = model(input_ids, token_type_ids=token_type_ids)
        if isinstance(logits, tuple):  # for gpt2 and maybe others
            logits = logits[0]
        logits = logits[0, -1, :] / args.temperature
        logits = top_filtering(logits, top_k=args.top_k, top_p=args.top_p)
        probs = F.softmax(logits, dim=-1)

        prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1)
        if i < args.min_length and prev.item() in special_tokens_ids:
            while prev.item() in special_tokens_ids:
                if probs.max().item() == 1:
                    warnings.warn(
                        "Warning: model generating special token with probability 1.")
                    break  # avoid infinitely looping over special token
                prev = torch.multinomial(probs, num_samples=1)

        if prev.item() in special_tokens_ids:
            break
        current_output.append(prev.item())

    return current_output


def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt",
                        help="Model type (openai-gpt or gpt2)", choices=['openai-gpt',
                                                                         'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (
        OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in
                     dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)


if __name__ == "__main__":
    run()
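As a quick illustration of how `top_filtering` combines top-k and nucleus (top-p) filtering before sampling, here is a small standalone sketch on a toy logits vector. The numbers are made up purely for demonstration, and it assumes `top_filtering` is importable from `interact.py` in the repo root.

```python
import torch
import torch.nn.functional as F

from interact import top_filtering  # assumes running from the repo root

# Toy demonstration (values are made up): keep the smallest set of tokens
# whose cumulative probability reaches top_p, mask the rest to -inf, then
# renormalize with softmax and sample one token id.
logits = torch.tensor([3.0, 2.5, 1.0, 0.2, -1.0])
filtered = top_filtering(logits.clone(), top_k=0, top_p=0.9)
probs = F.softmax(filtered, dim=-1)
next_token = torch.multinomial(probs, 1).item()
print(filtered, probs, next_token)
```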
counsel-chat-master/train.py
ADDED
@@ -0,0 +1,267 @@
# Copyright (c) 2019-present, HuggingFace Inc.
# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree.
import os
import math
import logging
from pprint import pformat
from argparse import ArgumentParser
from collections import defaultdict
from itertools import chain

import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler
from transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                          GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)

from utils import get_dataset, make_logdir

SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<speaker1>', '<speaker2>']}
MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]

logger = logging.getLogger(__file__)

def average_distributed_scalar(scalar, args):
    """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """
    if args.local_rank == -1:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
    return scalar_t.item()


def pad_dataset(dataset, padding=0):
    """ Pad the dataset. This could be optimized by defining a Dataset class and padding at the batch level, but this is simpler. """
    max_l = max(len(x) for x in dataset["input_ids"])
    for name in PADDED_INPUTS:
        dataset[name] = [x + [padding if name != "lm_labels" else -100] * (max_l - len(x)) for x in dataset[name]]
    return dataset


def add_special_tokens_(model, tokenizer):
    """ Add special tokens to the tokenizer and the model if they have not already been added. """
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)  # doesn't add if they are already there
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)

def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
    """ Build a sequence of input from 3 segments: persona, history and last reply. """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])]
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
    instance = {}
    instance["input_ids"] = list(chain(*sequence))
    instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    instance["lm_labels"] = [-100] * len(instance["input_ids"])
    if lm_labels:
        instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
    return instance


def get_data_loaders(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if args.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(args.num_candidates, num_candidates)
        for dialog in dataset:
            persona = dialog["personality"].copy()
            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]:
                    history = utterance["history"][-(2*args.max_history+1):]
                    for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                        lm_labels = bool(j == num_candidates-1)
                        instance = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels)
                        for input_name, input_array in instance.items():
                            datasets[dataset_name][input_name].append(input_array)
                    datasets[dataset_name]["mc_labels"].append(num_candidates - 1)
                    datasets[dataset_name]["n_candidates"] = num_candidates
                persona = [persona[-1]] + persona[:-1]  # permuted personalities

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            if input_name != "mc_labels":
                tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False)

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    return train_loader, valid_loader, train_sampler, valid_sampler


def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="/Users/tetracycline/repos/datascience/datascience/projects/counsel_chat_all_data_300-tokens.json", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # cant use Autotokenizer because checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)


    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(
            input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
            mc_labels=mc_labels, lm_labels=lm_labels
        )
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we dont send labels to model, it doesnt return losses
            lm_logits, mc_logits, *_ = model(
                input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
            )
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()

if __name__ == "__main__":
    train()
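Because train.py writes the config, tokenizer, and (after renaming the last checkpoint to WEIGHTS_NAME) the model weights into the run's log directory, a finished run can be reloaded with `from_pretrained`, which is exactly what interact.py does. A minimal sketch, assuming an openai-gpt run; the directory name below is a placeholder, use the `runs/...` folder your run actually created (the uploaded `counselchat-convai` directory has the same layout).

```python
import torch
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer

# Reload a finished fine-tuning run. The path is hypothetical; substitute the
# runs/<timestamp>_<host>_<model> directory produced by make_logdir, or the
# counselchat-convai directory shipped in this upload.
log_dir = "counselchat-convai"
tokenizer = OpenAIGPTTokenizer.from_pretrained(log_dir)
model = OpenAIGPTLMHeadModel.from_pretrained(log_dir)
model.eval()

# The training arguments are saved alongside the weights and can be inspected.
training_args = torch.load(log_dir + "/model_training_args.bin")
print(training_args)
```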
counsel-chat-master/utils.py
ADDED
@@ -0,0 +1,387 @@
from typing import Dict, Any, Callable, List, Tuple, Optional, Union
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.utils.multiclass import unique_labels
from sklearn.base import BaseEstimator, TransformerMixin
import re
import logging
import json
from datetime import datetime
import logging
import os
import tarfile
import tempfile
import socket

import torch

from transformers import cached_path

PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
HF_FINETUNED_MODEL = "https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/gpt_personachat_cache.tar.gz"

logger = logging.getLogger(__file__)

def download_pretrained_model():
    """ Download and extract finetuned model from S3 """
    resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
    tempdir = tempfile.mkdtemp()
    logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        def is_within_directory(directory, target):

            abs_directory = os.path.abspath(directory)
            abs_target = os.path.abspath(target)

            prefix = os.path.commonprefix([abs_directory, abs_target])

            return prefix == abs_directory

        def safe_extract(tar, path=".", members=None, *, numeric_owner=False):

            for member in tar.getmembers():
                member_path = os.path.join(path, member.name)
                if not is_within_directory(path, member_path):
                    raise Exception("Attempted Path Traversal in Tar File")

            tar.extractall(path, members, numeric_owner=numeric_owner)

        safe_extract(archive, tempdir)
    return tempdir


def get_dataset(tokenizer, dataset_path, dataset_cache):
    """ Get tokenized PERSONACHAT dataset from S3 or cache."""
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")
        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)
        dataset = tokenize(dataset)
        torch.save(dataset, dataset_cache)
    return dataset


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def make_logdir(model_name: str):
    """Create unique path to save results and checkpoints, e.g. runs/Sep22_19-45-59_gpu-7_gpt2"""
    # Code copied from ignite repo
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    logdir = os.path.join(
        'runs', current_time + '_' + socket.gethostname() + '_' + model_name)
    return logdir


def calculate_classification_metrics(
    y_true: np.array,
    y_pred: np.array,
    average: Optional[str] = None,
    return_df: bool = True,
) -> Union[Dict[str, float], pd.DataFrame]:
    """Computes f1, precision, recall, kappa, accuracy, and support

    Args:
        y_true: The true labels
        y_pred: The predicted labels
        average: How to average multiclass results
        return_df: Returns a dataframe if true otherwise a dictionary of performance
            values.

    Returns:
        Either a dataframe of the performance metrics or a single dictionary
    """
    labels = unique_labels(y_true, y_pred)

    # get results
    precision, recall, f_score, support = metrics.precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average=average
    )

    kappa = metrics.cohen_kappa_score(y_true, y_pred, labels=labels)
    accuracy = metrics.accuracy_score(y_true, y_pred)

    # create a pandas DataFrame
    if return_df:
        results = pd.DataFrame(
            {
                "class": labels,
                "f_score": f_score,
                "precision": precision,
                "recall": recall,
                "support": support,
                "kappa": kappa,
                "accuracy": accuracy,
            }
        )
    else:
        results = {
            "f1": f_score,
            "precision": precision,
            "recall": recall,
            "kappa": kappa,
            "accuracy": accuracy,
        }

    return results


def visualize_performance(
    df: pd.DataFrame,
    metrics: List[str],
    ax: Optional[Any] = None,
    title: Optional[str] = None,
    ylim: Optional[Tuple[float, float]] = None,
    figsize: Optional[Tuple[int, int]] = None,
    use_class_names: bool = True
) -> None:
    """Takes a Performance DF and converts it to a bar plot performance graph

    Args:
        df: A dataframe where each row is a class and each column is a metric
        metrics: A list of metrics from the columns of df to plot
        ax: A matplotlib axes object that we want to draw the plot on
        title: The title of the plot
        ylim: The minimum and maximum range for the yaxis.
        figsize: The width and height of the figure. This does nothing if ax is set
        use_class_names: This will label the x ticks with the class name in a multiclass setting.
    """
    unstacked_df = (
        df[metrics]
        .T.unstack()
        .reset_index()
        .rename(
            index=str, columns={"level_0": "class", "level_1": "metric", 0: "score"}
        )
    )

    if use_class_names:
        unstacked_df["class"] = unstacked_df["class"].apply(
            lambda x: df["class"].tolist()[x]
        )

    if figsize is None:
        figsize = (10, 7)

    # Display the graph
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 7))

    sns.barplot(x="class", y="score", hue="metric", data=unstacked_df, ax=ax)

    # Format the graph
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    if title is not None:
        ax.set_title(title, fontsize=20)

    if ylim is not None:
        ax.set_ylim(ylim)

    plt.tight_layout()


class BertTransformer(BaseEstimator, TransformerMixin):
    """See https://towardsdatascience.com/build-a-bert-sci-kit-transformer-59d60ddd54a5#d608"""
    def __init__(
        self,
        bert_tokenizer,
        bert_model,
        max_length: int = 60,
        embedding_func: Optional[Callable[[Tuple[torch.tensor]], torch.tensor]] = None,
    ):
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :]

    # TODO:: PADDING

    def _tokenize(self, text: str):
        tokenized_text = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_length
        )["input_ids"]
        attention_mask = [1] * len(tokenized_text)

        # bert takes in a batch so we need to unsqueeze the rows
        return (
            torch.tensor(tokenized_text).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str):
        tokenized, attention_mask = self._tokenize(text)

        embeddings = self.model(tokenized, attention_mask)
        return self.embedding_func(embeddings)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self


def convert_df_to_conv_ai_dict(df: pd.DataFrame,
                               personality: List[str],
                               response_columns: List[str],
                               tokenizer: Callable[[str], List[str]],
                               max_tokens: Optional[int] = None,
                               n_candidates: int = 6
                               ) -> Dict[str, List[Any]]:
    """
    Each entry in personachat is a dict with two keys personality and utterances, the dataset is a list of entries.
    personality: list of strings containing the personality of the agent
    utterances: list of dictionaries, each of which has two keys which are lists of strings.
        candidates: [next_utterance_candidate_1, ..., next_utterance_candidate_19]
            The last candidate is the ground truth response observed in the conversational data
        history: [dialog_turn_0, ... dialog_turn N], where N is an odd number since the other user starts every conversation.
    Preprocessing:
        - Spaces before periods at end of sentences
        - everything lowercase

    Process each row of a DataFrame. For each row:
    1. Grab the conversational input text
    2. Grab the responses
    3. Create a unique data entry for each response to the question.
    4. Sample random response sentences from the dataset.
    5. Combine the random responses into a candidate list.

    Args:
        df: The counsel chat pandas dataframe
        personality: The personality we would like to use during training
        response_columns: Columns which contain valid responses to the question. For example,
            the answerText column is the complete response of the therapist
        tokenizer: The transformers library tokenizer associated with the model we will be
            training. It is used for setting the maximum sequence length
        max_tokens: The maximum number of tokens that any candidate, response, or question should be.
        n_candidates: The number of candidate phrases to include in the dataset for training.
            The last member of candidates is the ground truth response

    Returns:
        A dictionary with a train and validation key.
    """
    # Add one because the index of the dataframe is the 0th position.
    tuple_map = {name: index + 1 for index, name in enumerate(df.columns.tolist())}

    train = []
    val = []
    # Step through every row in the dictionary
    for row in df.itertuples():

        # Get the question name and title
        # TODO:: MAKE THIS GENERAL YOU DUMB DUMB
        question_title = row[tuple_map["questionTitle"]]
        question_text = row[tuple_map["questionText"]]
        question_combined = question_title + " " + question_text

        # Step through every response column in the row
        for response_column in response_columns:

            # Get the true response
            true_response = row[tuple_map[response_column]]

            # We only want to add data if a good response exists
            if len(true_response) > 1:
                # Get candidate alternate sentences by sampling from all other questions
                candidates = sample_candidates(df, row[tuple_map["questionID"]], "questionID", "answerText",
                                               n_candidates)

                # Add the correct response to the end
                candidates.append(true_response)

                # We want to trim the size of the tokens
                if max_tokens is not None:
                    # Use the provided tokenizer to tokenize the input and truncate at max_tokens
                    question_combined = tokenizer.convert_tokens_to_string(
                        tokenizer.tokenize(question_combined)[:max_tokens])
                    candidates = [tokenizer.convert_tokens_to_string(tokenizer.tokenize(candidate)[:max_tokens]) for
                                  candidate in candidates]

                if len(candidates) != n_candidates + 1:
                    print(true_response)
                    assert False

                # Define the personality and the history
                d = {"personality": personality,
                     "utterances": [{"history": [question_combined],
                                     "candidates": candidates}]}
                if getattr(row, "split") == "train":
                    train.append(d)
                elif getattr(row, "split") == "val":
                    val.append(d)

    data = {"train": train, "valid": val}

    return data


def sample_candidates(df: pd.DataFrame, current_id: Any, id_column: str, text_column: str, n: int) -> List[str]:
    """Samples candidate responses to a question from the dataframe

    It is aware of data splits and only samples from within the same split. This avoids
    leaking information between training validation and testing. The sampled responses are
    also drawn from all rows which do not have the same id as the current_id

    Args:
        df: The dataframe we want to sample responses from
        current_id: The unique identifier we would like to leave out of our sampling
        id_column: The column name in the dataframe with the unique ids. current_id should
            be an element of this column
        text_column: The column with the text we want to sample
        n: How many samples we want to take.

    Returns:
        A list of sampled strings from our dataframe.
    """
    # We must only sample candidates from the correct data split to avoid information leakage across channels
    split = df[df[id_column] == current_id]["split"].tolist()[0]
    candidate_df = df[df["split"] == split]

    # Sample random rows from the dataframe not matching the current id
    sampled_texts = candidate_df[candidate_df[id_column] != current_id].sample(n + 15)[text_column].tolist()

    # join them all
    text = " ".join(sampled_texts)

    # Replace all newlines with spaces...
    text_no_newline = re.sub("\n", " ", text).lower()

    # Split on punctuation
    split_text = re.split('[?.!]', text_no_newline)

    # Remove all empty lines
    filtered_text = [x.strip() for x in split_text if len(x.strip()) > 1]

    # Shuffle the list
    return np.random.choice(filtered_text, n).tolist()
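To tie the pieces together, `convert_df_to_conv_ai_dict` is the function that turns the scraped CSV into the ConvAI-style JSON that train.py consumes, presumably how `data/counsel_chat_250-tokens_full.json` was produced. A minimal sketch, assuming the 20200325 CSV and the openai-gpt tokenizer; the empty personality, the 250-token cap, and the output filename are illustrative choices rather than a verified reproduction of the original pipeline.

```python
import json

import pandas as pd
from transformers import OpenAIGPTTokenizer

from utils import convert_df_to_conv_ai_dict

# Sketch of building the ConvAI-format JSON from the scraped CSV. The empty
# personality, 250-token cap, and output path mirror the files in data/ but
# are assumptions, not the author's confirmed settings.
df = pd.read_csv("data/20200325_counsel_chat.csv")
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
d = convert_df_to_conv_ai_dict(df, [""], ["answerText"], tokenizer,
                               max_tokens=250, n_candidates=6)
with open("data/counsel_chat_250-tokens_full.json", "w") as f:
    json.dump(d, f)
```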
counsel-chat-master/utils_test.py
ADDED
@@ -0,0 +1,50 @@
from typing import List
import unittest

import ddt
from ddt import unpack, data
import numpy as np
import pandas as pd
from transformers import OpenAIGPTTokenizer

from utils import sample_candidates, convert_df_to_conv_ai_dict


# fmt: off
class UtilsTest(unittest.TestCase):

    def test_sample_candidates(self):
        # Test that the held-out question's answer doesn't show up in the samples.
        # Create a small random DF to sample from. The rows below are placeholder
        # data assumed for illustration; the original test left them unspecified.
        rows = [(i, "answer text {}".format(i), "train") for i in range(30)]
        df = pd.DataFrame(rows, columns=["questionID", "answerText", "split"])
        n_candidates = 3

        for i in range(5):
            candidates = sample_candidates(df, 0, "questionID", "answerText", n_candidates)
            # Check that the samples don't come from the true data
            self.assertEqual(len(candidates), n_candidates)
            for candidate in candidates:
                self.assertNotIn("answer text 0", candidate)

    def test_fuzz_convert_df_to_conv_ai_dict(self):
        df = pd.read_csv("data/20200325_counsel_chat.csv")
        df = df[df["split"] == "train"]
        tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
        for i in range(5):
            temp_df = df.sample(100)
            max_tokens = np.random.randint(1, 200)
            n_candidates = np.random.randint(1, 10)
            d = convert_df_to_conv_ai_dict(temp_df,
                                           [""],
                                           ["answerText"],
                                           tokenizer,
                                           max_tokens=max_tokens,
                                           n_candidates=n_candidates)

            # Test max length
            self.assertLessEqual(max([len(x["utterances"][0]["history"][0].split()) for x in d["train"]]), max_tokens)

            # Test n_candidates is equal to the number in the candidates list plus the one true response.
            train_lengths = [len(x["utterances"][0]["candidates"]) for x in d["train"]]
            self.assertEqual(n_candidates + 1, max(train_lengths))
            self.assertEqual(n_candidates + 1, min(train_lengths))


if __name__ == "__main__":
    unittest.main()
counselchat-convai/added_tokens.json
ADDED
@@ -0,0 +1 @@
{"<bos>": 40478, "<eos>": 40479, "<pad>": 40480, "<speaker1>": 40481, "<speaker2>": 40482}
counselchat-convai/config.json
ADDED
@@ -0,0 +1,54 @@
{
  "afn": "gelu",
  "architectures": [
    "OpenAIGPTLMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "embd_pdrop": 0.1,
  "eos_token_ids": null,
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_epsilon": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "model_type": "openai-gpt",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 512,
  "n_special": 0,
  "num_beams": 1,
  "num_labels": 1,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": null,
  "predict_special_tokens": true,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": false,
  "use_bfloat16": false,
  "vocab_size": 40483
}
counselchat-convai/events.out.tfevents.1586138049.3b21657f0ca9
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a2c720c161121fb6a7457b091cdfb68090b7a55ab9dc7090a4d93a9791077c5b
size 140048
counselchat-convai/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
counselchat-convai/model_training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:65010fa713a71eac503cd4522be309b75f2874212050c38a780c5c7d7b73dfa9
size 729
counselchat-convai/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dd9ab879baa4977df5caba66f5b923729f5618aee99b7aa3f5fe244a9e59150d
size 478771216
counselchat-convai/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<bos>", "eos_token": "<eos>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<speaker1>", "<speaker2>"]}
counselchat-convai/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"max_len": 512}
counselchat-convai/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff