-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtesting.py
More file actions
265 lines (205 loc) · 11.9 KB
/
testing.py
File metadata and controls
265 lines (205 loc) · 11.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import torch
import numpy as np
from training import corrcoef
from training import RMSELoss
from torch.nn import L1Loss
from torch.nn import MSELoss
from sklearn.metrics.pairwise import euclidean_distances
from training import pad_batch_online
def isnotebook():
    """
    Report whether the code runs inside a Jupyter notebook or qtconsole.

    :return: bool
        True only when ``get_ipython`` exists and the shell class is named
        'ZMQInteractiveShell'; False for a terminal IPython session, any other
        shell type, or when ``get_ipython`` is not defined at all
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # get_ipython is not defined -> probably the standard Python interpreter
        return False
    # 'TerminalInteractiveShell' and every other shell type count as "not a notebook"
    return shell_name == 'ZMQInteractiveShell'
# Choose the tqdm flavour matching the current frontend (notebook widget vs. plain console bar).
if not isnotebook():
    from tqdm import tqdm
else:
    from tqdm.notebook import tqdm

# Enable tqdm's pandas integration for the selected tqdm class.
tqdm.pandas()

# Module-level loss instances.
rmse = RMSELoss(eps=0)
l1 = L1Loss()
l2 = MSELoss()
class Testing:
    """
    Create Testing Instance

    Scores a model one sample at a time (see ``score``) and can afterwards map the
    predicted vectors back to labels (see ``predict_labels``).

    :param model: torch model
        model to evaluate; the device is taken from its first parameter
    :param inps: pd.Series
        series containing inputs (each element must support ``len``)
    :param tgts: pd.Series
        series containing the corresponding targets (each element must support ``len``)
    :param criterion: torch.Loss
        criterion to calculate the loss between targets and predictions
        - may return a single loss or a tuple ``(loss, subloss_1, subloss_2, ...)``
        - if using cross correlation the function must accept predicted correlations vs. true correlations
          and predicted vectors vs. target vectors (in that order)
    :param with_onset_dim: bool
        forwarded unchanged to ``pad_batch_online``
    :param with_time_dim: bool
        forwarded unchanged to ``pad_batch_online``
    :param use_cross_corr: bool
        specify whether to calculate the cross correlation
        (requires labels, label_vectors and cross_corr_matrix)
    :param labels: pd.Series
        series containing the label for each input sample
    :param label_vectors: pd.DataFrame
        pd.DataFrame containing the semantic vectors for each label
        - read through its ``label`` and ``vector`` columns; rows are addressed by position
    :param cross_corr_matrix: pd.DataFrame
        pd.DataFrame containing the pairwise correlation between all unique labels
        - rows are looked up by label via ``.loc``
    """

    _METRICS = ("euclidean", "cross-correlation")

    def __init__(self, model, inps, tgts, criterion,
                 with_onset_dim=False,
                 with_time_dim=False,
                 use_cross_corr=False,
                 labels=None,
                 label_vectors=None,
                 cross_corr_matrix=None):
        self.model = model
        self.inps = inps
        self.tgts = tgts
        # None stands in for "no labels" so that no list object is shared between instances
        self.labels = [] if labels is None else labels

        self.predictions = None
        self.losses = None
        self.sublosses = None

        self.device = next(model.parameters()).device  # get device model is located at
        self.lens_input = self._sequence_lengths(inps)
        self.lens_output = self._sequence_lengths(tgts)

        self.criterion = criterion
        self.use_cross_corr = use_cross_corr
        self.with_onset_dim = with_onset_dim
        self.with_time_dim = with_time_dim

        # necessary for predicting labels
        self.label_vectors = label_vectors
        self.label_vectors_np = None
        self.label_vectors_torch = None  # built eagerly for use_cross_corr, lazily otherwise
        if label_vectors is not None:
            self.label_vectors_np = np.array(list(label_vectors.vector))

        # predicting labels
        self.top_10_predicted_labels_euclidean = None  # top 10 predicted labels by distance to the prediction
        self.top_10_predicted_labels_euclidean_distance = None  # distance of top 10 predicted labels to the prediction
        self.euclidean_dist_to_true_labels = None  # euclidean distance of true label to the prediction
        self.rank_of_true_label_in_predictions_euclidean = None  # rank of the true label by distance to the prediction

        self.top_10_predicted_labels_cross_correlation = None
        self.top_10_predicted_labels_cross_correlation_correlation = None
        self.cross_correlation_with_true_labels = None
        self.rank_of_true_label_in_predictions_cross_correlation = None

        self.cross_corr_matrix = cross_corr_matrix
        if use_cross_corr:
            assert len(self.labels) > 0, "In order to use cross correlation please provide labels!"
            assert cross_corr_matrix is not None, "Please provide a precomputed Cross Correlation Matrix!"
            assert label_vectors is not None, "Please provide a lookup df with labels and corresponding embedding vectors!"
            self.label_vectors_torch = torch.from_numpy(np.asarray(list(label_vectors.vector))).to(self.device)

    def _sequence_lengths(self, series):
        """Return the length of every element of ``series`` as a tensor on the model's device."""
        # the builtin ``int`` replaces the ``np.int`` alias, which no longer exists in current NumPy
        return torch.Tensor(np.array(series.apply(len), dtype=int)).to(self.device)

    def _label_vectors_as_tensor(self):
        """Return the label vectors as a tensor on the model's device, creating it on first use."""
        if getattr(self, "label_vectors_torch", None) is None:
            self.label_vectors_torch = torch.from_numpy(
                np.asarray(list(self.label_vectors.vector))).to(self.device)
        return self.label_vectors_torch

    def _position_of_label(self, true_label):
        """Return the row position (not the index label) of ``true_label`` in ``self.label_vectors``."""
        positions = np.flatnonzero(np.asarray(self.label_vectors.label == true_label))
        if len(positions) == 0:
            raise IndexError("label %r not found in label_vectors" % (true_label,))
        return int(positions[0])

    def score(self):
        """
        Function for scoring model on inputs

        :return :
            -
            - predictions made by the model for each sample are stored in self.predictions
            - the loss for each prediction with respect to the provided criterion is stored in self.losses
            - if the criterion returns a tuple, the remaining entries are stored per subloss in self.sublosses,
              otherwise self.sublosses is reset to None
        """
        test_predictions = []
        test_losses = []
        test_sublosses = []

        with torch.no_grad():  # no gradient calculation
            for idx in tqdm(range(len(self.inps)), desc="Predicting..."):
                # slices (instead of scalar indexing) keep the leading batch dimension of size 1
                lens_input_jj = self.lens_input[idx:(idx + 1)]
                batch_input = self.inps.iloc[idx:(idx + 1)]
                batch_input = pad_batch_online(lens_input_jj, batch_input, self.device,
                                               self.with_onset_dim, self.with_time_dim)

                Y_hat = self.model(batch_input, lens_input_jj)
                batch_output = torch.tensor([self.tgts.iloc[idx]], device=self.device)

                if self.use_cross_corr:
                    sample_label = self.labels.iloc[idx]
                    batch_output_cross_corr = torch.from_numpy(
                        np.asarray([self.cross_corr_matrix.loc[sample_label]])).to(self.device)
                    # first row of the correlation matrix without its first entry:
                    # correlation of the prediction with every label vector
                    Y_hat_cross_corr = corrcoef(torch.cat((Y_hat, self.label_vectors_torch)))[:1, 1:]
                    loss = self.criterion(Y_hat_cross_corr, batch_output_cross_corr, Y_hat, batch_output)
                else:
                    loss = self.criterion(Y_hat, batch_output)

                if isinstance(loss, tuple):
                    sub_losses = loss[1:]
                    loss = loss[0]
                    # for each sample [subloss1_i, subloss2_i, subloss3_i]
                    test_sublosses.append([sub_loss.item() for sub_loss in sub_losses])
                test_losses.append(loss.item())

                test_predictions.append(Y_hat.cpu().detach().numpy()[0])

        self.predictions = test_predictions
        self.losses = test_losses
        if len(test_sublosses) > 0:
            test_sublosses = np.asarray(test_sublosses)
            # for each subloss [subloss1_i, subloss1_j, subloss1_k]
            self.sublosses = [test_sublosses[:, i] for i in range(test_sublosses.shape[1])]
        else:
            # do not keep sublosses of an earlier scoring run around
            self.sublosses = None

    def predict_top10_labels(self, prediction, true_label, metric):
        """
        :param prediction: np.array
            model prediction of semantic vector
        :param true_label: str
            true label name
        :param metric: str
            metric to predict labels (one of: "cross-correlation" or "euclidean")
        :return label_pred, dist[top_10_closest], dist_of_true_label, rank_of_true_label_in_dist: np.array, np.array, float, int
            - np.array of labels for top 10 closest vectors to predicted vector
            - np.array with corresponding distances (or correlations) of these top 10 vectors to prediction
            - distance (or correlation) of true vector to predicted
            - 1-based rank of true vector with respect to the distance (or correlation) to the predicted vector
        :raises ValueError: if metric is not one of the supported metrics
        :raises IndexError: if true_label does not occur in self.label_vectors
        """
        if metric not in self._METRICS:
            raise ValueError("metric must be one of %s, got %r" % (list(self._METRICS), metric))

        # Scores are ordered like the rows of label_vectors, so the true label has to be
        # located by row position rather than by whatever index the DataFrame carries.
        true_index = self._position_of_label(true_label)

        if metric == "cross-correlation":
            prediction = torch.from_numpy(np.asarray([prediction])).to(self.device)
            stacked = torch.cat((prediction, self._label_vectors_as_tensor()))
            # cross correlation of prediction with all label vectors
            scores = np.array(corrcoef(stacked)[:1, 1:].cpu())[0]
            ordering = np.argsort(scores)[::-1]  # sort descending (highest correlation first)
        else:
            prediction = np.asarray([prediction])
            scores = euclidean_distances(prediction, self.label_vectors_np)[0]
            ordering = np.argsort(scores)  # sort ascending (smallest distance first)

        top_10 = ordering[:10]
        rank_of_true_label = np.where(ordering == true_index)[0][0] + 1
        label_pred = np.array(self.label_vectors.iloc[top_10].label)
        return label_pred, scores[top_10], scores[true_index], rank_of_true_label

    def predict_labels(self, metric):
        """
        :param metric: str
            - metric to get labels from predicted semantic vectors
              (one of: "euclidean" or "cross-correlation")
        :return:
            sets self.top_10_predicted_labels_...
                 self.top_10_predicted_labels_..._distance / ..._correlation
                 self..._to_true_labels / self.cross_correlation_with_true_labels
                 self.rank_of_true_label_in_predictions_...
            by using provided metric
        """
        assert self.predictions is not None, "In order to predict labels please call the score function first!"
        assert metric in ["euclidean", "cross-correlation"], "only euclidean distance and cross-correlation are implemnted to get the label of the closest target vector form the prediction!"
        assert self.label_vectors is not None, "In order to compare the predicted vectors to the true vectors, please provide a lookup df with labels and corresponding embedding vectors!"
        assert len(self.labels) > 0, "In order to compare the prediction to the true label, true labels must be provided!"

        top_10_predicted_labels = []
        top_10_predicted_labels_distance = []
        dist_to_true_labels = []
        rank_of_true_label_in_predictions = []

        for i, prediction in enumerate(tqdm(self.predictions, desc="Predicting Labels...")):
            true_label = self.labels.iloc[i]
            top10, top10_distance, true_label_distance, true_label_rank = self.predict_top10_labels(
                prediction, true_label, metric=metric)

            top_10_predicted_labels.append(top10)
            top_10_predicted_labels_distance.append(top10_distance)
            dist_to_true_labels.append(true_label_distance)
            rank_of_true_label_in_predictions.append(true_label_rank)

        if metric == "cross-correlation":
            self.top_10_predicted_labels_cross_correlation = top_10_predicted_labels
            self.top_10_predicted_labels_cross_correlation_correlation = top_10_predicted_labels_distance
            self.cross_correlation_with_true_labels = dist_to_true_labels
            self.rank_of_true_label_in_predictions_cross_correlation = rank_of_true_label_in_predictions
        else:
            self.top_10_predicted_labels_euclidean = top_10_predicted_labels
            self.top_10_predicted_labels_euclidean_distance = top_10_predicted_labels_distance
            self.euclidean_dist_to_true_labels = dist_to_true_labels
            self.rank_of_true_label_in_predictions_euclidean = rank_of_true_label_in_predictions