"""Synthetic dataset generators (source code for etudes.datasets.synthetic)."""

import numpy as np
import tensorflow_probability as tfp

from sklearn.utils import check_random_state, shuffle as _shuffle

tfd = tfp.distributions


def synthetic_sinusoidal(x):
    """
    Evaluate the synthetic latent function ``sin(12 x) + 0.66 cos(25 x)``.

    Parameters
    ----------
    x : scalar or array-like supporting NumPy arithmetic
        Point(s) at which the function is evaluated.

    Returns
    -------
    Same shape as `x`
        The function value(s), computed element-wise.
    """
    return np.sin(12.0*x) + 0.66*np.cos(25.0*x)


def make_regression_dataset(latent_fn=synthetic_sinusoidal):
    """
    Make a loader that generates a noisy synthetic regression dataset.

    Parameters
    ----------
    latent_fn : callable
        Noise-free function mapping an input array of shape
        ``(num_samples, num_features)`` to the latent targets.

    Returns
    -------
    load_data : callable
        Function ``load_data(num_samples, num_features, noise_variance,
        x_min=0., x_max=1., squeeze=True, random_state=None)`` returning
        the pair ``(X, Y)``.

    Examples
    --------

    Draw noisy observations of the default latent function and plot them
    against the noise-free curve.

    .. plot::
        :context: close-figs

        import numpy as np
        import matplotlib.pyplot as plt

        from etudes.datasets import synthetic_sinusoidal, make_regression_dataset

        num_train = 64 # nbr training points in synthetic dataset
        num_index_points = 256
        num_features = 1
        observation_noise_variance = 1e-1

        f = synthetic_sinusoidal
        X_pred = np.linspace(-0.6, 0.6, num_index_points).reshape(-1, num_features)

        load_data = make_regression_dataset(f)
        X_train, Y_train = load_data(num_train, num_features,
                                     observation_noise_variance,
                                     x_min=-0.5, x_max=0.5)

        fig, ax = plt.subplots()

        ax.plot(X_pred, f(X_pred), label="true")
        ax.scatter(X_train, Y_train, marker='x', color='k',
                    label="noisy observations")

        ax.legend()

        ax.set_xlabel(r'$x$')
        ax.set_ylabel(r'$y$')

        plt.show()
    """

    def load_data(num_samples, num_features, noise_variance,
                  x_min=0., x_max=1., squeeze=True, random_state=None):
        """
        Sample inputs uniformly and return them with noisy targets.

        Parameters
        ----------
        num_samples : int
            Number of data points to generate.
        num_features : int
            Number of input dimensions.
        noise_variance : float
            Variance of the zero-mean Gaussian observation noise.
        x_min, x_max : float
            Bounds of the uniform distribution the inputs are drawn from.
        squeeze : bool
            If True, length-one axes are removed from `Y`.
        random_state : None, int or numpy.random.RandomState
            Seed or generator, resolved by ``check_random_state``.

        Returns
        -------
        X : numpy.ndarray
            Inputs of shape ``(num_samples, num_features)``.
        Y : numpy.ndarray
            ``latent_fn(X)`` plus noise, squeezed when `squeeze` is True.

        Raises
        ------
        ValueError
            If `noise_variance` is negative.
        """
        if np.any(np.less(noise_variance, 0.)):
            raise ValueError("noise_variance must be non-negative")

        rng = check_random_state(random_state)

        # `randn` draws unit-variance noise, so it must be scaled by the
        # standard deviation (not the variance) to obtain noise whose
        # variance equals `noise_variance`.
        # The noise is drawn before the inputs to keep the order in which
        # the random stream is consumed.
        eps = np.sqrt(noise_variance) * rng.randn(num_samples, num_features)

        X = x_min + (x_max - x_min) * rng.rand(num_samples, num_features)
        Y = latent_fn(X) + eps

        if squeeze:
            Y = np.squeeze(Y)

        return X, Y

    return load_data


def make_classification_dataset(X_pos, X_neg, shuffle=False, dtype="float64",
                                random_state=None):
    """
    Build a binary classification dataset from two groups of inputs.

    Parameters
    ----------
    X_pos : array-like
        Inputs assigned the label 1.
    X_neg : array-like
        Inputs assigned the label 0.
    shuffle : bool
        If True, the rows of `X` and `y` are shuffled jointly.
    dtype : str or numpy.dtype
        Data type the stacked inputs `X` are cast to. The labels are not
        cast.
    random_state : None, int or numpy.random.RandomState
        Passed to the shuffling routine; only used when `shuffle` is True.

    Returns
    -------
    X : numpy.ndarray
        `X_pos` stacked vertically on top of `X_neg`.
    y : numpy.ndarray
        Labels: ones for the rows from `X_pos`, zeros for those from `X_neg`.
    """
    X = np.vstack([X_pos, X_neg]).astype(dtype)
    y = np.hstack([np.ones(len(X_pos)), np.zeros(len(X_neg))])

    if shuffle:
        X, y = _shuffle(X, y, random_state=random_state)

    return X, y


def make_density_ratio_estimation_dataset(p=None, q=None):
    """
    Make a loader that samples a density-ratio estimation dataset.

    Samples from `p` are labelled 1 and samples from `q` are labelled 0.

    Parameters
    ----------
    p : distribution, optional
        Object exposing ``sample(sample_shape=..., seed=...)``. Defaults to
        a two-component Gaussian mixture with weights 0.3 and 0.7,
        locations 2.0 and -3.0, and scales 1.0 and 0.5.
    q : distribution, optional
        Object exposing ``sample(sample_shape=..., seed=...)``. Defaults to
        a Gaussian with location 0.0 and scale 2.0.

    Returns
    -------
    load_data : callable
        Function ``load_data(num_samples, rate=0.5, dtype="float64",
        seed=None)`` returning the pair ``(X, y)``.
    """
    if p is None:
        p = tfd.MixtureSameFamily(
            mixture_distribution=tfd.Categorical(probs=[0.3, 0.7]),
            components_distribution=tfd.Normal(loc=[2.0, -3.0],
                                               scale=[1.0, 0.5]))

    if q is None:
        q = tfd.Normal(loc=0.0, scale=2.0)

    def load_data(num_samples, rate=0.5, dtype="float64", seed=None):
        """
        Draw samples from `p` and `q` and label them 1 and 0 respectively.

        Parameters
        ----------
        num_samples : int
            Total number of samples across both distributions.
        rate : float
            Fraction of `num_samples` drawn from `p` (truncated to an
            integer count); the remainder is drawn from `q`.
        dtype : str or numpy.dtype
            Data type of the returned inputs.
        seed : optional
            Seed forwarded to both ``sample`` calls.

        Returns
        -------
        X : numpy.ndarray
            Samples from `p` followed by samples from `q`, one per row.
        y : numpy.ndarray
            Labels: 1 for rows drawn from `p`, 0 for rows drawn from `q`.
        """
        num_p = int(num_samples * rate)
        num_q = num_samples - num_p

        # NOTE(review): the same `seed` is handed to both draws; confirm
        # that sharing one seed between `p` and `q` is intended.
        X_p = p.sample(sample_shape=(num_p, 1), seed=seed).numpy()
        X_q = q.sample(sample_shape=(num_q, 1), seed=seed).numpy()

        # `shuffle` is left at its default (False), so `random_state` has no
        # effect here and the rows stay ordered: `p` samples, then `q`.
        X, y = make_classification_dataset(X_p, X_q, dtype=dtype,
                                           random_state=seed)

        return X, y

    return load_data