Joblib
English
llm
human-feedback
weak supervision
data filtering
Inference Endpoints
Christopher Glaze committed on
Commit
b208d2e
1 Parent(s): 34c2f2c

Initial commit of model files.

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ **/__pycache__
handler.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Dict, List, Any, Union, Optional
3
+ from pathlib import Path
4
+ import json
5
+ import joblib
6
+ import pandas as pd
7
+ import nltk
8
+ from transformers import AutoModel, AutoTokenizer
9
+ import torch
10
+ import numpy as np
11
+ from sklearn.base import TransformerMixin
12
+
13
class SimcseGenerator(TransformerMixin):
    """Sentence-embedding transformer backed by a pretrained encoder.

    Texts are tokenized and encoded in batches, the encoder's
    ``pooler_output`` is used as the sentence embedding, and every embedding
    is scaled to unit L2 norm, so the dot product of two rows equals their
    cosine similarity.
    """

    def __init__(
        self,
        device: str = 'cpu',
        batch_size: int = 16,
        model_name: str = "princeton-nlp/unsup-simcse-bert-base-uncased",
    ) -> None:
        """Load the tokenizer and encoder for ``model_name`` onto ``device``.

        Args:
            device: Torch device string the encoder is moved to.
            batch_size: Number of texts encoded per forward pass. The default
                of 16 is kept because anything larger risked running out of
                memory on EC2 dev instances.
            model_name: Identifier passed to ``from_pretrained``.
        """
        self.model_name = model_name
        self.device = torch.device(device)
        self.batch_size = batch_size

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Embed every text in ``X`` as a unit-length vector.

        Args:
            X: Sequence of strings to embed (callers in this file pass lists).

        Returns:
            A 2-D array with one L2-normalized embedding per input text, in
            input order. An empty input yields an array with zero rows.
        """
        if len(X) == 0:
            # np.concatenate raises on an empty list of batches, so answer
            # the empty case directly.
            hidden_size = getattr(self.model.config, "hidden_size", 0)
            return np.empty((0, hidden_size), dtype=np.float32)

        batches = []
        # Honour the configured batch size instead of a hard-coded constant.
        for start in range(0, len(X), self.batch_size):
            # Slicing clamps at the end of X; hand the tokenizer a plain list.
            texts = list(X[start:start + self.batch_size])
            inputs = self.tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt",
            ).to(self.device)
            with torch.no_grad():
                # Only the pooled output is needed, so per-layer hidden
                # states are not requested.
                pooled = self.model(**inputs, return_dict=True).pooler_output
            batches.append(pooled.cpu().detach().numpy())

        embeddings = np.concatenate(batches)
        # Scale each row to unit L2 norm.
        embeddings /= np.sqrt(np.square(embeddings).sum(axis=1))[:, np.newaxis]

        return embeddings
54
+
55
class EndpointHandler():
    """Classify instructions and score response quality for inference calls.

    On construction the handler loads, from the directory containing this
    file, a stop-word list, an instruction label map, and two joblib
    pipelines (instruction classification and response quality). Calling the
    handler with a dict or a DataFrame returns the predicted instruction
    class, its confidence and, when a ``response`` column is present, a
    response-quality score.
    """

    def __init__(self, device: str = "cpu"):
        """Load all artifacts needed at inference time.

        Args:
            device: Torch device string forwarded to ``SimcseGenerator``.
        """
        local_path = Path(__file__).parent
        self.device = device

        with open(local_path / 'stop_words.json', 'r', encoding='utf-8') as fp:
            self.stop_words = set(json.load(fp))

        with open(local_path / 'instruction_label_map.json', 'r', encoding='utf-8') as fp:
            # JSON object keys are strings; predicted class ids are looked up
            # as ints, so convert the keys once here.
            self.instruction_label_map = {int(k): v for k, v in json.load(fp).items()}

        # NOTE: joblib.load unpickles arbitrary objects. Only load pipeline
        # files that come from a trusted source.
        self.instruction_pipeline = joblib.load(local_path / 'instruction_classification_pipeline.joblib')
        self.response_pipeline = joblib.load(local_path / 'response_quality_pipeline.joblib')

        self.simcse_generator = SimcseGenerator(device=self.device)

    def _get_stop_word_proportion(self, s):
        """Return the fraction of tokens in ``s`` that are stop words.

        The text is lower-cased and tokenized with NLTK. Text that produces
        no tokens yields 0.
        """
        s = s.lower()
        try:
            words = nltk.tokenize.word_tokenize(s)
        except Exception:
            # Best-effort retry kept from the original implementation: drop
            # the first character and tokenize again. A second failure
            # propagates to the caller.
            words = nltk.tokenize.word_tokenize(s[1:])

        if not words:
            return 0
        return sum(word in self.stop_words for word in words) / len(words)

    def predict_instruction_classes(self, df: pd.DataFrame) -> "tuple[np.ndarray, np.ndarray]":
        """Predict an instruction class label and confidence for each row.

        Args:
            df: Rows to classify, passed unchanged to the instruction pipeline.

        Returns:
            A pair ``(labels, confidences)``: the class names looked up in the
            label map, and the maximum predicted probability per row.
        """
        class_ids = self.instruction_pipeline.predict(df)
        confidences = self.instruction_pipeline.predict_proba(df).max(axis=1)
        labels = np.array([self.instruction_label_map[class_id] for class_id in class_ids])
        return labels, confidences

    def _one_hot_instruction_classes(self, instruction_classes, index):
        """Return a float one-hot frame of the labels, one column per known class.

        Columns follow the label map's id order (0, 1, 2, ...). ``index``
        becomes the frame's index so it lines up row-for-row with the input
        frame when the two are concatenated.
        """
        class_names = [self.instruction_label_map[i] for i in range(len(self.instruction_label_map))]
        labels = np.asarray(instruction_classes)
        matches = labels[:, np.newaxis] == np.array(class_names)[np.newaxis, :]
        return pd.DataFrame(matches, columns=class_names, index=index).astype(float)

    def compute_response_quality_feature_space(self, df: pd.DataFrame, instruction_classes: Optional[np.ndarray] = None):
        """Build the feature frame consumed by the response-quality pipeline.

        Args:
            df: Frame with ``instruction`` and ``response`` columns.
            instruction_classes: Class label per row; predicted from ``df``
                when omitted.

        Returns:
            A new frame holding the columns of ``df`` plus one-hot class
            columns, ``instruction_response_similarity``, ``token_number``
            and ``stop_word_proportion``.
        """
        if instruction_classes is None:
            instruction_classes, _ = self.predict_instruction_classes(df)

        # Reuse df's index: a default RangeIndex here would misalign with a
        # non-default df index and make the column-wise concat add NaN rows.
        onehot = self._one_hot_instruction_classes(instruction_classes, df.index)
        df1 = pd.concat([df, onehot], axis=1)

        # Embeddings are unit length, so the row-wise dot product is the
        # cosine similarity between instruction and response.
        instruction_emb = self.simcse_generator.transform(df['instruction'].tolist())
        response_emb = self.simcse_generator.transform(df['response'].tolist())
        df1['instruction_response_similarity'] = (instruction_emb * response_emb).sum(axis=1)

        df1['token_number'] = df1['response'].str.split().apply(len)
        df1['stop_word_proportion'] = df1['response'].apply(self._get_stop_word_proportion)

        return df1

    def predict_response_quality(self, df, instruction_classes):
        """Return the response pipeline's probability of class 1 for each row."""
        df1 = self.compute_response_quality_feature_space(df, instruction_classes)
        return self.response_pipeline.predict_proba(df1)[:, 1]

    def __call__(self, df: Union[pd.DataFrame, Dict]):
        """Run inference on one record (dict) or many records (DataFrame).

        Args:
            df: A dict describing a single example, or a DataFrame of
                examples. A ``response`` entry/column is optional; a missing
                ``dataset`` column is filled with empty strings.

        Returns:
            For a dict input, a dict with ``'instruction class'``,
            ``'instruction class confidence'`` and, when a response was given,
            ``'response quality'``, using native Python values. For a
            DataFrame input, a DataFrame of the same fields indexed like the
            input.
        """
        is_dict = isinstance(df, dict)

        if is_dict:
            df = pd.DataFrame([df])

        if 'dataset' not in df.columns:
            # assign() returns a new frame, so the caller's DataFrame is not
            # modified.
            df = df.assign(dataset='')

        instruction_classes, instruction_class_confidences = self.predict_instruction_classes(df)

        predictions = [
            {'instruction class': label, 'instruction class confidence': confidence}
            for label, confidence in zip(instruction_classes, instruction_class_confidences)
        ]

        if 'response' in df.columns:
            response_qualities = self.predict_response_quality(df, instruction_classes)
            for prediction, response_quality in zip(predictions, response_qualities):
                prediction['response quality'] = response_quality

        if is_dict:
            # Unwrap numpy scalars so the single-record result can be
            # serialized by the standard json module.
            return {
                key: value.item() if isinstance(value, np.generic) else value
                for key, value in predictions[0].items()
            }
        return pd.DataFrame(predictions, index=df.index)
instruction_classification_pipeline.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bba5f13a13cdd94cb631f160c9e09ef3c93c6ead7622ec4b0b960a17ff7487c
3
+ size 21068584
instruction_label_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"0": "open-qa", "1": "closed-qa", "2": "summarization", "3": "generation", "4": "brainstorming", "5": "other"}
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ numpy==1.24.2
2
+ pandas==1.5.3
3
+ torch==2.0.1
4
+ transformers==4.30.2
5
+ nltk==3.8.1
6
+ scikit-learn==1.3.0
7
+ xgboost==1.7.6
response_quality_pipeline.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a116abfb535671bb07da6ca5f211f671f2aeb856b093a8a8d5c62df17d4ed5a3
3
+ size 37881144
stop_words.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["y", "own", "here", "hadn", "our", "s", "yours", "you'll", "those", "having", "between", "ain", "each", "haven't", "needn", "under", "can", "before", "but", "because", "me", "hasn't", "against", "then", "only", "at", "just", "weren", "of", "above", "as", "further", "myself", "you", "some", "yourself", "from", "out", "shouldn", "while", "be", "wouldn", "into", "her", "mightn't", "their", "m", "same", "any", "wasn", "if", "who", "for", "t", "shan't", "mightn", "have", "i", "he", "a", "are", "does", "over", "until", "its", "my", "couldn't", "you'd", "to", "not", "ma", "whom", "won't", "the", "being", "shouldn't", "ourselves", "isn", "about", "did", "shan", "after", "didn't", "them", "once", "all", "too", "she's", "or", "in", "on", "am", "mustn't", "than", "that", "few", "that'll", "ve", "with", "couldn", "itself", "down", "it's", "where", "such", "isn't", "didn", "again", "will", "when", "through", "him", "mustn", "won", "doesn", "o", "herself", "weren't", "themselves", "which", "don", "wasn't", "below", "most", "re", "what", "your", "ours", "you're", "she", "his", "himself", "up", "yourselves", "should've", "this", "is", "you've", "d", "very", "these", "more", "off", "doing", "hers", "and", "both", "how", "aren't", "do", "needn't", "it", "don't", "no", "haven", "were", "by", "hasn", "during", "wouldn't", "aren", "had", "should", "has", "been", "other", "ll", "theirs", "why", "so", "doesn't", "an", "hadn't", "nor", "was", "there", "now", "they", "we"]
tests.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from handler import EndpointHandler

# Build the handler once; construction loads every artifact it needs.
endpoint = EndpointHandler()

# A single instruction/response pair in the dict form the handler accepts.
sample = {
    "instruction": "What are some ways to stay energized throughout the day?",
    "response": "Drink lots of coffee!",
}

# Run inference on the sample and show the resulting prediction.
print(endpoint(sample))