diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 8e131aed..76eba228 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,7 @@
 Changelog
 =========
 
+- Add RobustGPyRegression model that can handle non-finite output values
 - Update surrogate model initialisation to use all initial evidence
 - Update BOLFI and BOLFIRE to use a shared sample class that returns individual chains in the arviz inference data
 - Use kernel copy to avoid pickle issue and allow BOLFI parallelisation with non-default kernel
diff --git a/elfi/__init__.py b/elfi/__init__.py
index 0e0a17b9..b6e75989 100644
--- a/elfi/__init__.py
+++ b/elfi/__init__.py
@@ -26,7 +26,7 @@ from elfi.visualization.visualization import nx_draw as draw
 from elfi.visualization.visualization import plot_params_vs_node
 from elfi.visualization.visualization import plot_predicted_summaries
 
-from elfi.methods.bo.gpy_regression import GPyRegression
+from elfi.methods.bo.gpy_regression import GPyRegression, RobustGPyRegression
 
 __author__ = 'ELFI authors'
 __email__ = 'elfi-support@hiit.fi'
diff --git a/elfi/methods/bo/acquisition.py b/elfi/methods/bo/acquisition.py
index 0f44c656..17bbcacd 100644
--- a/elfi/methods/bo/acquisition.py
+++ b/elfi/methods/bo/acquisition.py
@@ -435,7 +435,7 @@ def evaluate_gradient(self, theta_new, t=None):
         a = (self.eps - mean) / scale
         b = np.sqrt(sigma2_n) / np.sqrt(sigma2_n + 2 * var)
         grad_a = (-1. / scale) * grad_mean - \
-            ((self.eps - mean) / (2. * (sigma2_n + var)**(1.5))) * grad_var
+            np.nan_to_num((self.eps - mean) / (2. * (sigma2_n + var)**(1.5))) * grad_var
         grad_b = (-np.sqrt(sigma2_n) / (sigma2_n + 2 * var)**(1.5)) * grad_var
 
         _phi_a = phi(a)
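
Note on the nan_to_num guard above: once the surrogate may report a non-finite mean (as RobustGPyRegression does at inputs classified as failed), (self.eps - mean) becomes -inf while the masked grad_var is zero, and -inf * 0 would propagate NaN into grad_a. A minimal sketch of the mechanism, with illustrative values only:

    import numpy as np

    eps, mean, grad_var = 0.1, np.inf, 0.0   # mean is inf at a masked point, its grad_var is 0
    raw = (eps - mean) / 2.0                 # -inf
    print(raw * grad_var)                    # nan: -inf * 0
    print(np.nan_to_num(raw) * grad_var)     # -0.0: clipped to a large finite value first
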
diff --git a/elfi/methods/bo/gpy_regression.py b/elfi/methods/bo/gpy_regression.py
index a44d63be..d921f811 100644
--- a/elfi/methods/bo/gpy_regression.py
+++ b/elfi/methods/bo/gpy_regression.py
@@ -41,7 +41,7 @@ def __init__(self,
             Alternatives: "scg", "fmin_tnc", "simplex", "lbfgsb", "lbfgs", "sgd"
             See also: paramz.Model.optimize()
         max_opt_iters : int, optional
-        gp : GPy.model.GPRegression instance, optional
+        gp : GPy.models.GPRegression instance, optional
         **gp_params
             kernel : GPy.Kern
             noise_var : float
@@ -362,3 +362,250 @@ def copy(self):
             kopy.gp_params['mean_function'] = self.gp_params['mean_function'].copy()
 
         return kopy
+
+
+class RobustGPyRegression(GPyRegression):
+    """Robust Gaussian Process regression using the GPy library.
+
+    Restricts the regression model to evidence with finite output values. Simulations
+    that produce non-finite output are considered failed.
+    """
+
+    def __init__(self,
+                 parameter_names=None,
+                 bounds=None,
+                 train_classifier=False,
+                 thd=0,
+                 clf=None,
+                 clf_kernel=None,
+                 **kwargs):
+        """Initialize RobustGPyRegression.
+
+        Parameters
+        ----------
+        parameter_names : list of str, optional
+            Names of parameter nodes. If None, sets dimension to 1.
+        bounds : dict, optional
+            The region where to estimate the posterior for each parameter in
+            model.parameters.
+            `{'parameter_name':(lower, upper), ... }`
+            If not supplied, defaults to (0, 1) bounds for all dimensions.
+        train_classifier : boolean, optional
+            Train a classifier to predict finite output probabilities.
+        thd : float, optional
+            Success probability threshold: predictions at inputs whose estimated
+            probability of producing finite output falls below this value are
+            reported as failed. Setting thd > 0 implies train_classifier.
+        clf : GPy.models.GPClassification instance, optional
+            Differentiates between inputs that return finite (1) and non-finite (0) outputs.
+        clf_kernel : GPy.Kern, optional
+            Kernel function, defaults to RBF.
+        **kwargs : see GPyRegression
+
+        """
+        super().__init__(parameter_names, bounds, **kwargs)
+
+        self._X = np.zeros((0, self.input_dim))
+        self._Y = np.zeros((0, 1))
+
+        self.thd = thd
+        self.train_classifier = self.thd > 0 or train_classifier
+
+        self._clf = clf
+        self._clf_kernel = clf_kernel or GPy.kern.RBF(self.input_dim, ARD=True)
+
+        self.FAILED_OUTPUT = np.inf
+        self.FAILED_VAR = 0.00001
+
+    def success_proba(self, x):
+        """Return predicted finite output probabilities at x.
+
+        Parameters
+        ----------
+        x : np.array
+            numpy compatible (n, input_dim) array of points to evaluate
+
+        Returns
+        -------
+        np.array
+            with shape (x.shape[0], 1)
+
+        """
+        # Ensure it's 2d for GPy
+        x = np.asanyarray(x).reshape((-1, self.input_dim))
+
+        if self._clf is not None:
+            return self._clf.predict(x)[0]
+        else:
+            return np.ones((len(x), 1))
+
+    def success_proba_gradients(self, x):
+        """Return predicted finite output probability gradients at x.
+
+        Parameters
+        ----------
+        x : np.array
+            numpy compatible (n, input_dim) array of points to evaluate
+
+        Returns
+        -------
+        np.array
+            with shape (x.shape[0], input_dim)
+
+        """
+        # Ensure it's 2d for GPy
+        x = np.asanyarray(x).reshape((-1, self.input_dim))
+
+        if self._clf is not None:
+            return self._clf.predictive_gradients(x)[0][:, :, 0]
+        else:
+            return np.zeros_like(x)
+
+    def predict(self, x, **kwargs):
+        """Return predicted mean and variance at x.
+
+        Parameters
+        ----------
+        x : np.array
+            numpy compatible (n, input_dim) array of points to evaluate
+            if len(x.shape) == 1 will be cast to 2D with x[None, :]
+        **kwargs
+            Passed on to GPyRegression.predict.
+
+        Returns
+        -------
+        tuple
+            GP (mean, var) at x where
+                mean : np.array
+                       with shape (x.shape[0], 1)
+                var : np.array
+                      with shape (x.shape[0], 1)
+
+        """
+        # Ensure it's 2d for GPy
+        x = np.asanyarray(x).reshape((-1, self.input_dim))
+        mean, var = super().predict(x, **kwargs)
+
+        if self.thd > 0 and self._clf is not None:
+            mask = (self.success_proba(x) < self.thd).reshape(-1)
+            mean[mask] = self.FAILED_OUTPUT
+            var[mask] = self.FAILED_VAR
+
+        return mean, var
+
+    def predictive_gradients(self, x):
+        """Return the gradients of predicted mean and variance at x.
+
+        Parameters
+        ----------
+        x : np.array
+            numpy compatible (n, input_dim) array of points to evaluate
+            if len(x.shape) == 1 will be cast to 2D with x[None, :]
+
+        Returns
+        -------
+        tuple
+            GP (grad_mean, grad_var) at x where
+                grad_mean : np.array
+                            with shape (x.shape[0], input_dim)
+                grad_var : np.array
+                           with shape (x.shape[0], input_dim)
+
+        """
+        # Ensure it's 2d for GPy
+        x = np.asanyarray(x).reshape((-1, self.input_dim))
+        grad_mean, grad_var = super().predictive_gradients(x)
+
+        if self.thd > 0 and self._clf is not None:
+            mask = (self.success_proba(x) < self.thd).reshape(-1)
+            grad_mean[mask] = 0
+            grad_var[mask] = 0
+
+        return grad_mean, grad_var
+
+    def _make_classifier_instance(self, x, y, kernel):
+        return GPy.models.GPClassification(x, y, kernel=kernel)
+
+    def update(self, x, y, optimize=False):
+        """Update the surrogate model with new data.
+
+        Parameters
+        ----------
+        x : np.array
+        y : np.array
+        optimize : bool, optional
+            Whether to optimize hyperparameters.
+
+        """
+        # Must cast these as 2d for GPy
+        x = x.reshape((-1, self.input_dim))
+        y = y.reshape((-1, 1))
+
+        self._X = np.r_[self._X, x]
+        self._Y = np.r_[self._Y, y]
+
+        # Update regression model without failed simulations
+        mask = np.isfinite(y).reshape(-1)
+        if mask.any():
+            super().update(x[mask], y[mask], optimize=False)
+        elif self._gp is None:
+            raise RuntimeError("No finite outputs available for model initialisation.")
+
+        # Update classification model
+        if self.train_classifier:
+            labels = np.isfinite(self._Y).astype(int)
+            if self._clf is not None:
+                # Reconstruct with new data
+                self._clf = self._make_classifier_instance(self._X, labels, self._clf.kern.copy())
+            elif not mask.all():
+                # Initialise
+                self._clf = self._make_classifier_instance(self._X, labels, self._clf_kernel)
+
+        if optimize:
+            self.optimize()
+
+    def optimize(self):
+        """Optimize GP hyperparameters."""
+        super().optimize()
+        if self._clf is not None:
+            try:
+                self._clf.optimize(self.optimizer, max_iters=self.max_opt_iters)
+            except np.linalg.LinAlgError:
+                logger.warning("Numerical error in GP optimization. Stopping optimization")
+
+    @property
+    def n_evidence(self):
+        """Return the number of observed samples."""
+        return len(self._Y)
+
+    @property
+    def n_valid_evidence(self):
+        """Return the number of valid observed samples."""
+        return np.sum(np.isfinite(self._Y))
+
+    @property
+    def X(self):
+        """Return input evidence."""
+        return self._X
+
+    @property
+    def Y(self):
+        """Return output evidence."""
+        Y = self._Y.copy()
+        Y[~np.isfinite(Y)] = self.FAILED_OUTPUT
+        return Y
+
+    @property
+    def classifier_instance(self):
+        """Return the classifier instance."""
+        return self._clf
+
+    def copy(self):
+        """Return a copy of current instance."""
+        kopy = super().copy()
+
+        if self._clf is not None:
+            x = self._clf.X.copy()
+            y = self._clf.Y.copy()
+            kopy._clf = self._make_classifier_instance(x, y, self._clf.kern.copy())
+
+        if self._clf_kernel is not None:
+            kopy._clf_kernel = self._clf_kernel.copy()
+
+        return kopy
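
A minimal usage sketch of the class above (toy data; the 0.5 threshold and parameter names are illustrative, API calls as defined in the patch):

    import numpy as np
    import elfi

    rng = np.random.RandomState(0)
    X = rng.rand(30, 2) * 5                          # inputs in [0, 5)^2
    Y = np.sum((X - 3) ** 2, axis=1, keepdims=True)  # toy discrepancies
    Y[np.linalg.norm(X, axis=1) < 2.5] = np.nan      # simulations near the origin fail

    bounds = {'mu_1': (0, 5), 'mu_2': (0, 5)}
    model = elfi.RobustGPyRegression(['mu_1', 'mu_2'], bounds=bounds, thd=0.5)
    model.update(X, Y, optimize=True)

    print(model.n_evidence, model.n_valid_evidence)  # 30 vs. the number of finite outputs
    print(model.success_proba([[1, 1], [4, 4]]))     # predicted probabilities of finite output
    mean, var = model.predict([1, 1])                # FAILED_OUTPUT/FAILED_VAR below the threshold
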
diff --git a/elfi/visualization/visualization.py b/elfi/visualization/visualization.py
index d6c29eca..9ca870c2 100644
--- a/elfi/visualization/visualization.py
+++ b/elfi/visualization/visualization.py
@@ -450,8 +450,10 @@ def plot_gp(gp, parameter_names, axes=None, resol=50,
     shape = (n_plots, n_plots)
     axes, kwargs = _create_axes(axes, shape, **kwargs)
 
-    x_evidence = gp.X
-    y_evidence = gp.Y
+    valid_inds = np.isfinite(gp.Y).squeeze()
+    x_evidence = gp.X[valid_inds]
+    y_evidence = gp.Y[valid_inds]
+    x_evidence_failed = gp.X[~valid_inds]
     if const is None:
         const = x_evidence[np.argmin(y_evidence), :]
     bounds = bounds or gp.bounds
@@ -488,6 +490,13 @@
                                      color="red",
                                      alpha=0.7,
                                      s=5)
+                if len(x_evidence_failed) > 0:
+                    axes[jy, ix].scatter(x_evidence_failed[:, ix],
+                                         x_evidence_failed[:, jy],
+                                         marker="^",
+                                         color="blue",
+                                         alpha=0.5,
+                                         s=7)
 
                 if true_params is not None:
                     axes[jy, ix].plot([true_params[parameter_names[ix]],
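
With the failed-evidence markers in place, plotting needs no new arguments; a sketch, assuming a fitted RobustGPyRegression instance `model` as in the example above:

    import matplotlib.pyplot as plt
    from elfi.visualization.visualization import plot_gp

    plot_gp(model, ['mu_1', 'mu_2'])  # failed evidence now drawn as blue triangles
    plt.show()
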
diff --git a/tests/functional/test_failure_robust.py b/tests/functional/test_failure_robust.py
new file mode 100644
index 00000000..e2d5292b
--- /dev/null
+++ b/tests/functional/test_failure_robust.py
@@ -0,0 +1,54 @@
+from functools import partial
+
+import numpy as np
+import pytest
+
+import elfi
+from elfi.examples.gauss import euclidean_multidim, gauss_nd_mean
+
+
+def toy_sim(*mu, cov, batch_size=1, random_state=None):
+    out = gauss_nd_mean(*mu, cov_matrix=cov, batch_size=batch_size, random_state=random_state)
+    failed = np.sqrt(np.sum(np.power(mu, 2), axis=0)) < 2.5
+    out[failed] = np.nan
+    return out
+
+
+def get_model(true_params, seed):
+    cov = (0.1 * np.diag(np.ones((2,))) + 0.5 * np.ones((2, 2)))
+    sim = partial(toy_sim, cov=cov)
+
+    random_state = np.random.RandomState(seed)
+    obs = sim(*true_params, random_state=random_state)
+
+    m = elfi.ElfiModel()
+    mu_1 = elfi.Prior('uniform', 0, 5, model=m)
+    mu_2 = elfi.Prior('uniform', 0, 5, model=m)
+    y = elfi.Simulator(sim, mu_1, mu_2, observed=obs)
+    mean = elfi.Summary(partial(np.mean, axis=1), y)
+    d = elfi.Discrepancy(euclidean_multidim, mean)
+    return m
+
+
+@pytest.mark.slowtest
+def test_failure_robust_BOLFI():
+    seed = 123
+    true_params = [3, 3]
+    m = get_model(true_params, seed)
+    bounds = {'mu_1': (0, 5), 'mu_2': (0, 5)}
+    target_model = elfi.RobustGPyRegression(m.parameter_names, bounds=bounds, thd=0.5)
+    bolfi = elfi.BOLFI(m['d'], initial_evidence=20, target_model=target_model, seed=seed)
+    post = bolfi.fit(n_evidence=100)
+
+    # check that optimisation avoided infeasible parameter combinations
+    assert bolfi.target_model.n_valid_evidence > 90
+
+    # check model minimum
+    res_1 = bolfi.extract_result()
+    assert np.isclose(res_1.x_min['mu_1'], 3, atol=0.35)
+    assert np.isclose(res_1.x_min['mu_2'], 3, atol=0.35)
+
+    # check posterior mean
+    res_2 = bolfi.sample(500)
+    assert np.isclose(np.mean(res_2.samples['mu_1']), 3, atol=0.35)
+    assert np.isclose(np.mean(res_2.samples['mu_2']), 3, atol=0.35)
diff --git a/tests/unit/test_failure_robust_model.py b/tests/unit/test_failure_robust_model.py
new file mode 100644
index 00000000..c0b4b47c
--- /dev/null
+++ b/tests/unit/test_failure_robust_model.py
@@ -0,0 +1,132 @@
+from functools import partial
+
+import numpy as np
+import pytest
+
+import elfi
+from elfi.examples.gauss import euclidean_multidim, gauss_nd_mean
+
+
+def toy_sim(*mu, cov, batch_size=1, random_state=None):
+    out = gauss_nd_mean(*mu, cov_matrix=cov, batch_size=batch_size, random_state=random_state)
+    failed = np.sqrt(np.sum(np.power(mu, 2), axis=0)) < 2.5
+    out[failed] = np.nan
+    return out
+
+
+def get_model(true_params, seed):
+    cov = (0.1 * np.diag(np.ones((2,))) + 0.5 * np.ones((2, 2)))
+    sim = partial(toy_sim, cov=cov)
+
+    random_state = np.random.RandomState(seed)
+    obs = sim(*true_params, random_state=random_state)
+
+    m = elfi.ElfiModel()
+    mu_1 = elfi.Prior('uniform', 0, 5, model=m)
+    mu_2 = elfi.Prior('uniform', 0, 5, model=m)
+    y = elfi.Simulator(sim, mu_1, mu_2, observed=obs)
+    mean = elfi.Summary(partial(np.mean, axis=1), y)
+    d = elfi.Discrepancy(euclidean_multidim, mean)
+    return m
+
+
+@pytest.fixture(scope='module')
+def model_1():
+    seed = 123
+    true_params = [3, 3]
+    m = get_model(true_params, seed)
+    data = m.generate(20, seed=seed)
+    X = np.column_stack((data['mu_1'], data['mu_2']))
+    Y = data['d']
+    bounds = {'mu_1': (0, 5), 'mu_2': (0, 5)}
+    target_model = elfi.RobustGPyRegression(m.parameter_names, bounds=bounds)
+    target_model.update(X, Y)
+    return target_model
+
+
+@pytest.fixture(scope='module')
+def model_2():
+    seed = 123
+    true_params = [3, 3]
+    m = get_model(true_params, seed)
+    data = m.generate(20, seed=seed)
+    X = np.column_stack((data['mu_1'], data['mu_2']))
+    Y = data['d']
+    bounds = {'mu_1': (0, 5), 'mu_2': (0, 5)}
+    target_model = elfi.RobustGPyRegression(m.parameter_names, bounds=bounds, thd=0.5)
+    target_model.update(X, Y)
+    return target_model
+
+
+def test_update():
+    X = np.random.rand(1, 2)
+    Y = np.ones((1, 1))
+    target_model = elfi.RobustGPyRegression(['param_1', 'param_2'], thd=0.5)
+    target_model.update(X, Y)
+    assert np.all(target_model.X == X)
+    assert np.all(target_model.Y == Y)
+
+
+def test_cannot_update():
+    X = np.random.rand(1, 2)
+    Y = np.nan * np.ones((1, 1))
+    target_model = elfi.RobustGPyRegression(['param_1', 'param_2'], thd=0.5)
+    with pytest.raises(RuntimeError):
+        target_model.update(X, Y)
+
+
+@pytest.mark.parametrize('model', ['model_1', 'model_2'])
+def test_n_evidence(model, request):
+    target_model = request.getfixturevalue(model)
+    assert target_model.n_evidence == 20
+    assert target_model.n_valid_evidence < target_model.n_evidence
+
+
+@pytest.mark.parametrize('model', ['model_1', 'model_2'])
+def test_predict(model, request):
+    target_model = request.getfixturevalue(model)
+    mu1, _ = target_model.predict([3, 3])
+    mu2, _ = target_model.predict([4, 4])
+    assert mu1 < mu2
+
+
+def test_predict_infeasible(model_2):
+    target_model = model_2
+    pred = target_model.predict([0, 0])
+    assert pred[0] == target_model.FAILED_OUTPUT
+    assert pred[1] == target_model.FAILED_VAR
+
+
+def test_predict_gradients_infeasible(model_2):
+    target_model = model_2
+    grad = target_model.predictive_gradients([0, 0])
+    assert grad[0].shape == (1, 2)
+    assert grad[1].shape == (1, 2)
+    assert np.all(grad[0] == 0)
+    assert np.all(grad[1] == 0)
+
+
+def test_success_proba(model_2):
+    target_model = model_2
+    prob1 = target_model.success_proba([4, 4])
+    prob2 = target_model.success_proba([2, 2])
+    assert prob1 > prob2
+
+
+def test_success_proba_default(model_1):
+    target_model = model_1
+    prob = target_model.success_proba([3, 3])
+    assert float(prob) == 1
+
+
+def test_success_proba_gradients(model_2):
+    target_model = model_2
+    grad = target_model.success_proba_gradients([0, 0])
+    assert grad.shape == (1, 2)
+
+
+def test_success_proba_gradients_default(model_1):
+    target_model = model_1
+    grad = target_model.success_proba_gradients([0, 0])
+    assert grad.shape == (1, 2)
+    assert np.all(grad == 0)
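
The functional test above doubles as an end-to-end recipe; condensed, and assuming an ELFI model m with discrepancy node 'd' such as the one built by get_model:

    bounds = {'mu_1': (0, 5), 'mu_2': (0, 5)}
    target_model = elfi.RobustGPyRegression(m.parameter_names, bounds=bounds, thd=0.5)
    bolfi = elfi.BOLFI(m['d'], initial_evidence=20, target_model=target_model, seed=123)
    bolfi.fit(n_evidence=100)   # acquisition avoids the predicted failure region
    result = bolfi.sample(500)  # posterior samples for mu_1, mu_2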