From 145e131366f4b48fdd095583647fd7ebc94c0473 Mon Sep 17 00:00:00 2001 From: yangarbiter Date: Fri, 24 Feb 2017 11:08:26 +0800 Subject: [PATCH 1/4] Add multilabel QUIRE --- .../query_strategies/multilabel/__init__.py | 1 + .../multilabel/multilabel_quire.py | 146 ++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 libact/query_strategies/multilabel/multilabel_quire.py diff --git a/libact/query_strategies/multilabel/__init__.py b/libact/query_strategies/multilabel/__init__.py index f0966ff4..a8d84140 100644 --- a/libact/query_strategies/multilabel/__init__.py +++ b/libact/query_strategies/multilabel/__init__.py @@ -6,3 +6,4 @@ from .maximum_margin_reduction import MaximumLossReductionMaximalConfidence as MMC from .multilable_with_auxiliary_learner import MultilabelWithAuxiliaryLearner from .binary_minimization import BinaryMinimization +from .multilabel_quire import MultilabelQUIRE diff --git a/libact/query_strategies/multilabel/multilabel_quire.py b/libact/query_strategies/multilabel/multilabel_quire.py new file mode 100644 index 00000000..b83aa85a --- /dev/null +++ b/libact/query_strategies/multilabel/multilabel_quire.py @@ -0,0 +1,146 @@ +"""Multi-label Active Learning with Auxiliary Learner +""" +import copy + +import numpy as np +from sklearn.svm import SVC +from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel,\ + rbf_kernel + +from libact.base.dataset import Dataset +from libact.base.interfaces import QueryStrategy, ContinuousModel +from libact.utils import inherit_docstring_from, seed_random_state, zip +from libact.models import LogisticRegression, SVM +from libact.models.multilabel import BinaryRelevance, DummyClf + + +class MultilabelQUIRE(QueryStrategy): + r"""Multi-label Querying Informative and Representative Examples + + Parameters + ---------- + lamba : float, optional default: 1. + Regularization term. 
+ + kernel : {'linear', 'poly', 'rbf', callable}, optional (default='rbf') + Specifies the kernel type to be used in the algorithm. + It must be one of 'linear', 'poly', 'rbf', or a callable. + If a callable is given it is used to pre-compute the kernel matrix + from data matrices; that matrix should be an array of shape + ``(n_samples, n_samples)``. + + degree : int, optional (default=3) + Degree of the polynomial kernel function ('poly'). + Ignored by all other kernels. + + gamma : float, optional (default=1.) + Kernel coefficient for 'rbf', 'poly'. + + coef0 : float, optional (default=1.) + Independent term in kernel function. + It is only significant in 'poly'. + + random_state : {int, np.random.RandomState instance, None}, optional (default=None) + If int or None, random_state is passed as parameter to generate + np.random.RandomState instance. If np.random.RandomState instance, + random_state is the random number generator. + + Attributes + ---------- + + Examples + -------- + Here is an example of declaring a multilabel with auxiliary learner + query_strategy object: + + .. code-block:: python + + from libact.query_strategies.multilabel import MultilabelWithAuxiliaryLearner + from libact.models.multilabel import BinaryRelevance + from libact.models import LogisticRegression, SVM + + qs = MultilabelWithAuxiliaryLearner( + dataset, + major_learner=BinaryRelevance(LogisticRegression()), + auxiliary_learner=BinaryRelevance(SVM()) + ) + + References + ---------- + .. [1] Huang, S. J., R. Jin, and Z. H. Zhou. "Active Learning by Querying + Informative and Representative Examples." IEEE transactions on + pattern analysis and machine intelligence 36.10 (2014): 1936. 
+ """ + + def __init__(self, dataset, lamba=1.0, kernel='rbf', gamma=1., coef0=1., + degree=3, random_state=None): + super(MultilabelQUIRE, self).__init__(dataset) + + self.lamba = lamba + + X, _ = zip(*dataset.get_entries()) + self.kernel = kernel + if self.kernel == 'rbf': + self.K = rbf_kernel(X=X, Y=X, gamma=kwargs.pop('gamma', 1.)) + elif self.kernel == 'poly': + self.K = polynomial_kernel(X=X, + Y=X, + coef0=kwargs.pop('coef0', 1), + degree=kwargs.pop('degree', 3), + gamma=kwargs.pop('gamma', 1.)) + elif self.kernel == 'linear': + self.K = linear_kernel(X=X, Y=X) + elif hasattr(self.kernel, '__call__'): + self.K = self.kernel(X=np.array(X), Y=np.array(X)) + else: + raise NotImplementedError + + self.random_state_ = seed_random_state(random_state) + + @inherit_docstring_from(QueryStrategy) + def make_query(self): + dataset = self.dataset + X, Y = zip(*dataset.get_entries()) + _, lbled_Y = zip(*dataset.get_labeled_entries()) + + X = np.array(X) + K = self.K + n = len(X) + m = np.shape(lbled_Y)[1] + lamba = self.lamba + + # index for labeled and unlabeled instance + l = np.array([i for i in range(len(Y)) if Y[i] is not None]) + l = np.tile(l, m) + u = np.array([i for i in range(len(Y)) if Y[i] is None]) + u = np.tile(u, m) + + # label correlation matrix + R = np.corrcoef(np.array(lbled_Y).T) + R = np.nan_to_num(R) + + L = lamba * (np.linalg.pinv(np.kron(R, K) + lamba * np.eye(n*m))) + inv_L = np.linalg.pinv(L) + + vecY = np.reshape(np.array([y for y in Y if y is not None]), (-1, 1)) + invLuu = np.linalg.pinv(L[np.ix_(u, u)]) + + score = np.zeros((n, m)) + for a in range(n): + for b in range(m): + s = b*n + a + U = np.dot(L[np.ix_(u, l)], vecY) + L[np.ix_(u, [s])] + temp1 = 2 * np.dot(L[[s], l], vecY) \ + - np.dot(np.dot(U.T, invLuu), U) + U = np.dot(L[np.ix_(u, l)], vecY) + temp0 = -(np.dot(np.dot(U.T, invLuu), U)) + score[a, b] = L[s, s] \ + + np.dot(np.dot(vecY.T, L[np.ix_(l, l)]), + vecY)[0, 0]\ + + np.max((temp1[0, 0], temp0[0, 0])) + + score = np.sum(score, 
axis=1) + + ask_id = self.random_state_.choice(np.where(score == np.min(score))[0]) + + return ask_id From a932ee948bea7b5c6766e3708a572d4db85be127 Mon Sep 17 00:00:00 2001 From: yangarbiter Date: Fri, 24 Feb 2017 14:50:45 +0800 Subject: [PATCH 2/4] multilabel QUIRE speed up a bit --- .../multilabel/multilabel_quire.py | 75 ++++++++++--------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/libact/query_strategies/multilabel/multilabel_quire.py b/libact/query_strategies/multilabel/multilabel_quire.py index b83aa85a..7dba8f60 100644 --- a/libact/query_strategies/multilabel/multilabel_quire.py +++ b/libact/query_strategies/multilabel/multilabel_quire.py @@ -97,6 +97,17 @@ def __init__(self, dataset, lamba=1.0, kernel='rbf', gamma=1., coef0=1., self.random_state_ = seed_random_state(random_state) + + _, lbled_Y = zip(*dataset.get_labeled_entries()) + n = len(X) + m = np.shape(lbled_Y)[1] + # label correlation matrix + R = np.corrcoef(np.array(lbled_Y).T) + R = np.nan_to_num(R) + + self.L = lamba * (np.linalg.pinv(np.kron(R, self.K) \ + + lamba * np.eye(n*m))) + @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset @@ -105,42 +116,36 @@ def make_query(self): X = np.array(X) K = self.K - n = len(X) + n_instance = len(X) m = np.shape(lbled_Y)[1] lamba = self.lamba # index for labeled and unlabeled instance - l = np.array([i for i in range(len(Y)) if Y[i] is not None]) - l = np.tile(l, m) - u = np.array([i for i in range(len(Y)) if Y[i] is None]) - u = np.tile(u, m) - - # label correlation matrix - R = np.corrcoef(np.array(lbled_Y).T) - R = np.nan_to_num(R) - - L = lamba * (np.linalg.pinv(np.kron(R, K) + lamba * np.eye(n*m))) - inv_L = np.linalg.pinv(L) - - vecY = np.reshape(np.array([y for y in Y if y is not None]), (-1, 1)) - invLuu = np.linalg.pinv(L[np.ix_(u, u)]) - - score = np.zeros((n, m)) - for a in range(n): - for b in range(m): - s = b*n + a - U = np.dot(L[np.ix_(u, l)], vecY) + L[np.ix_(u, [s])] - temp1 = 2 * 
np.dot(L[[s], l], vecY) \ - - np.dot(np.dot(U.T, invLuu), U) - U = np.dot(L[np.ix_(u, l)], vecY) - temp0 = -(np.dot(np.dot(U.T, invLuu), U)) - score[a, b] = L[s, s] \ - + np.dot(np.dot(vecY.T, L[np.ix_(l, l)]), - vecY)[0, 0]\ - + np.max((temp1[0, 0], temp0[0, 0])) - - score = np.sum(score, axis=1) - - ask_id = self.random_state_.choice(np.where(score == np.min(score))[0]) - - return ask_id + l_id = [] + a_id = [] + for i in range(n_instance * m): + if Y[i%n_instance] is None: + a_id.append(i) + else: + l_id.append(i) + + L = self.L + vecY = np.reshape(np.array([y for y in Y if y is not None]).T, (-1, 1)) + detLaa = np.linalg.det(L[np.ix_(a_id, a_id)]) + + score = [] + for i, s in enumerate(a_id): + u_id = a_id[:i] + a_id[i+1:] + invLuu = L[np.ix_(u_id, u_id)] \ + - 1./L[s, s] * np.dot(L[u_id, s], L[u_id, s].T) + score.append(L[s, s] - detLaa / L[s, s] \ + + 2 * np.abs(np.dot(L[np.ix_([s], l_id)] \ + - np.dot(np.dot(L[s, u_id], invLuu), + L[np.ix_(u_id, l_id)]), vecY))[0][0]) + + import ipdb; ipdb.set_trace() + score = np.sum(np.array(score).reshape(m, -1).T, axis=1) + + ask_idx = self.random_state_.choice(np.where(score == np.min(score))[0]) + + return a_id[ask_idx] From 65791a432ddad0a9130e8878188a84bfd25fbff8 Mon Sep 17 00:00:00 2001 From: yangarbiter Date: Wed, 1 Mar 2017 20:56:46 +0800 Subject: [PATCH 3/4] update multilabel QUIRE --- .../multilabel/multilabel_quire.py | 41 +++++++++++++++---- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/libact/query_strategies/multilabel/multilabel_quire.py b/libact/query_strategies/multilabel/multilabel_quire.py index 7dba8f60..e626e62f 100644 --- a/libact/query_strategies/multilabel/multilabel_quire.py +++ b/libact/query_strategies/multilabel/multilabel_quire.py @@ -104,9 +104,9 @@ def __init__(self, dataset, lamba=1.0, kernel='rbf', gamma=1., coef0=1., # label correlation matrix R = np.corrcoef(np.array(lbled_Y).T) R = np.nan_to_num(R) + self.RK = np.kron(R, self.K) - self.L = lamba * 
(np.linalg.pinv(np.kron(R, self.K) \ - + lamba * np.eye(n*m))) + self.L = lamba * (np.linalg.pinv(self.RK + lamba * np.eye(n*m))) @inherit_docstring_from(QueryStrategy) def make_query(self): @@ -115,7 +115,7 @@ def make_query(self): _, lbled_Y = zip(*dataset.get_labeled_entries()) X = np.array(X) - K = self.K + RK = self.RK n_instance = len(X) m = np.shape(lbled_Y)[1] lamba = self.lamba @@ -132,18 +132,41 @@ def make_query(self): L = self.L vecY = np.reshape(np.array([y for y in Y if y is not None]).T, (-1, 1)) detLaa = np.linalg.det(L[np.ix_(a_id, a_id)]) - + #invLaa = np.linalg.pinv(L[np.ix_(a_id, a_id)]) + invLaa = (lamba * np.eye(len(a_id)) + RK[np.ix_(a_id, a_id)]) \ + - np.dot(np.dot(RK[np.ix_(a_id, l_id)], + np.linalg.pinv(lamba * np.eye(len(l_id)) \ + + RK[np.ix_(l_id, l_id)])), + RK[np.ix_(l_id, a_id)]) + + b = np.zeros((len(a_id)-1)) score = [] + D = np.zeros((len(a_id)-1, len(a_id)-1)) + D[...] = invLaa[1:, 1:] for i, s in enumerate(a_id): + # L -> s, Laa -> i u_id = a_id[:i] + a_id[i+1:] - invLuu = L[np.ix_(u_id, u_id)] \ - - 1./L[s, s] * np.dot(L[u_id, s], L[u_id, s].T) + #D = np.delete(np.delete(invLaa, i, axis=0), i, axis=1) + if i > 0: + D[(i-1), :i] = invLaa[(i-1), :i] + D[(i-1), i:] = invLaa[(i-1), (i+1):] + D[:i, (i-1)] = invLaa[:i, (i-1)] + D[i:, (i-1)] = invLaa[(i+1):, (i-1)] + #D[:i, :i] = invLaa[:i, :i] + #D[i:, i:] = invLaa[i+1:, i+1:] + #D[:i, i:] = invLaa[:i, i+1:] + #D[i:, :i] = invLaa[i+1:, :i] + + #b = np.delete(invLaa, i, axis=0)[:, i] + b[:i] = invLaa[:i, i] + b[i:] = invLaa[i+1:, i] + invLuu = D - 1./invLaa[i, i] * np.dot(b, b.T) + score.append(L[s, s] - detLaa / L[s, s] \ - + 2 * np.abs(np.dot(L[np.ix_([s], l_id)] \ + + 2 * np.abs(np.dot(L[s, l_id] \ - np.dot(np.dot(L[s, u_id], invLuu), - L[np.ix_(u_id, l_id)]), vecY))[0][0]) + L[np.ix_(u_id, l_id)]), vecY))) - import ipdb; ipdb.set_trace() score = np.sum(np.array(score).reshape(m, -1).T, axis=1) ask_idx = self.random_state_.choice(np.where(score == np.min(score))[0]) From 
60d55460678bdace16065ce5257eb2c0b1095631 Mon Sep 17 00:00:00 2001 From: yangarbiter Date: Thu, 2 Mar 2017 13:11:14 +0800 Subject: [PATCH 4/4] add test for multilabel quire --- .../multilabel/multilabel_quire.py | 83 ++++++++++++------- .../multilabel/tests/test_multilabel_quire.py | 26 ++++++ libact/utils/__init__.py | 30 +++++++ 3 files changed, 107 insertions(+), 32 deletions(-) create mode 100644 libact/query_strategies/multilabel/tests/test_multilabel_quire.py diff --git a/libact/query_strategies/multilabel/multilabel_quire.py b/libact/query_strategies/multilabel/multilabel_quire.py index e626e62f..5c2e76ff 100644 --- a/libact/query_strategies/multilabel/multilabel_quire.py +++ b/libact/query_strategies/multilabel/multilabel_quire.py @@ -81,13 +81,10 @@ def __init__(self, dataset, lamba=1.0, kernel='rbf', gamma=1., coef0=1., X, _ = zip(*dataset.get_entries()) self.kernel = kernel if self.kernel == 'rbf': - self.K = rbf_kernel(X=X, Y=X, gamma=kwargs.pop('gamma', 1.)) + self.K = rbf_kernel(X=X, Y=X, gamma=gamma) elif self.kernel == 'poly': - self.K = polynomial_kernel(X=X, - Y=X, - coef0=kwargs.pop('coef0', 1), - degree=kwargs.pop('degree', 3), - gamma=kwargs.pop('gamma', 1.)) + self.K = polynomial_kernel(X=X, Y=X, coef0=coef0, degree=degree, + gamma=gamma) elif self.kernel == 'linear': self.K = linear_kernel(X=X, Y=X) elif hasattr(self.kernel, '__call__'): @@ -99,8 +96,9 @@ def __init__(self, dataset, lamba=1.0, kernel='rbf', gamma=1., coef0=1., _, lbled_Y = zip(*dataset.get_labeled_entries()) + self.n_labels = np.shape(lbled_Y)[1] n = len(X) - m = np.shape(lbled_Y)[1] + m = self.n_labels # label correlation matrix R = np.corrcoef(np.array(lbled_Y).T) R = np.nan_to_num(R) @@ -108,39 +106,60 @@ def __init__(self, dataset, lamba=1.0, kernel='rbf', gamma=1., coef0=1., self.L = lamba * (np.linalg.pinv(self.RK + lamba * np.eye(n*m))) - @inherit_docstring_from(QueryStrategy) - def make_query(self): - dataset = self.dataset - X, Y = zip(*dataset.get_entries()) - _, 
lbled_Y = zip(*dataset.get_labeled_entries()) - - X = np.array(X) - RK = self.RK - n_instance = len(X) - m = np.shape(lbled_Y)[1] - lamba = self.lamba - + def _get_index(self): + _, Y = zip(*self.dataset.get_entries()) + n_instance = len(Y) + m = self.n_labels # index for labeled and unlabeled instance l_id = [] a_id = [] for i in range(n_instance * m): - if Y[i%n_instance] is None: + if Y[i // m] is None: a_id.append(i) else: l_id.append(i) + return a_id, l_id + + #def update(self, entry_id, label): + # # calculate invLaa + # invLaa = self.invLaa + # # idx before update + # a_id, l_id = self.idxs + # m = len(label) + # # assert len(np.where(np.array(a_id) == entry_id*m)[0]) == 1 + # idx = np.where(np.array(a_id) == entry_id*m)[0][0] + # for i in range(m): + # D = np.delete(np.delete(invLaa, idx, axis=0), idx, axis=1) + # b = np.delete(invLaa, idx, axis=0)[:, idx] + # # invLuu + # invLaa = D - 1./invLaa[idx, idx] * np.dot(b, b.T) + # self.invLaa = invLaa + + @inherit_docstring_from(QueryStrategy) + def make_query(self): + dataset = self.dataset + X, Y = zip(*dataset.get_entries()) + X = np.array(X) + n_instance = len(X) + m = self.n_labels + RK = self.RK + lamba = self.lamba L = self.L - vecY = np.reshape(np.array([y for y in Y if y is not None]).T, (-1, 1)) - detLaa = np.linalg.det(L[np.ix_(a_id, a_id)]) - #invLaa = np.linalg.pinv(L[np.ix_(a_id, a_id)]) - invLaa = (lamba * np.eye(len(a_id)) + RK[np.ix_(a_id, a_id)]) \ + + a_id, l_id = self._get_index() + # invLaa = np.linalg.pinv(L[np.ix_(a_id, a_id)]) + invLaa = ((lamba * np.eye(len(a_id)) + RK[np.ix_(a_id, a_id)]) \ - np.dot(np.dot(RK[np.ix_(a_id, l_id)], np.linalg.pinv(lamba * np.eye(len(l_id)) \ + RK[np.ix_(l_id, l_id)])), - RK[np.ix_(l_id, a_id)]) + RK[np.ix_(l_id, a_id)])) / lamba + + vecY = np.reshape(np.array([y for y in Y if y is not None]).T, (-1, 1)) + detLaa = np.linalg.det(L[np.ix_(a_id, a_id)]) + score = np.zeros(len(a_id)) b = np.zeros((len(a_id)-1)) - score = [] D = np.zeros((len(a_id)-1, 
len(a_id)-1)) D[...] = invLaa[1:, 1:] for i, s in enumerate(a_id): @@ -162,13 +181,13 @@ def make_query(self): b[i:] = invLaa[i+1:, i] invLuu = D - 1./invLaa[i, i] * np.dot(b, b.T) - score.append(L[s, s] - detLaa / L[s, s] \ - + 2 * np.abs(np.dot(L[s, l_id] \ - - np.dot(np.dot(L[s, u_id], invLuu), - L[np.ix_(u_id, l_id)]), vecY))) + score[i] = L[s, s] - detLaa / L[s, s] \ + + 2 * np.abs(np.dot(L[s, l_id] \ + - np.dot(np.dot(L[s, u_id], invLuu), + L[np.ix_(u_id, l_id)]), vecY)) - score = np.sum(np.array(score).reshape(m, -1).T, axis=1) + score = np.sum(score.reshape(m, -1).T, axis=1) ask_idx = self.random_state_.choice(np.where(score == np.min(score))[0]) - return a_id[ask_idx] + return a_id[ask_idx] // m diff --git a/libact/query_strategies/multilabel/tests/test_multilabel_quire.py b/libact/query_strategies/multilabel/tests/test_multilabel_quire.py new file mode 100644 index 00000000..17040d15 --- /dev/null +++ b/libact/query_strategies/multilabel/tests/test_multilabel_quire.py @@ -0,0 +1,26 @@ +import unittest + +from numpy.testing import assert_array_equal +import numpy as np + +from libact.base.dataset import Dataset +from libact.query_strategies.multilabel import MultilabelQUIRE +from libact.utils import run_qs + + +class MultilabelQUIRETestCase(unittest.TestCase): + """Variance reduction test case using artifitial dataset""" + def setUp(self): + self.X = [[-2, -1], [1, 1], [-1, -2], [-1, -1], [1, 2], [2, 1]] + self.y = [[0, 1], [1, 0], [0, 1], [1, 0], [1, 0], [1, 1]] + self.quota = 4 + + def test_multilabel_quire(self): + trn_ds = Dataset(self.X, (self.y[:2] + [None] * (len(self.y) - 2))) + qs = MultilabelQUIRE(trn_ds) + qseq = run_qs(trn_ds, qs, self.y, self.quota) + assert_array_equal(qseq, np.array([2, 3, 4, 5])) + + +if __name__ == '__main__': + unittest.main() diff --git a/libact/utils/__init__.py b/libact/utils/__init__.py index d58fc0dc..93af3f0a 100644 --- a/libact/utils/__init__.py +++ b/libact/utils/__init__.py @@ -50,3 +50,33 @@ def calc_cost(y, 
yhat, cost_matrix): ith class and prediction as jth class. """ return np.mean(cost_matrix[list(y), list(yhat)]) + +def run_qs(trn_ds, qs, truth, quota): + """Run query strategy on specified dataset and return querying sequence. + + Parameters + ---------- + trn_ds : Dataset object + The dataset to be run on. + + qs : QueryStrategy instance + The active learning algorithm to be run. + + truth : array-like + The true label. + + quota : int + Number of iterations to run + + Returns + ------- + qseq : numpy array, shape (quota,) + The numpy array of entry_id representing querying sequence. + """ + ret = [] + for _ in range(quota): + ask_id = qs.make_query() + trn_ds.update(ask_id, truth[ask_id]) + + ret.append(ask_id) + return np.array(ret)