Source code for cyanure_pytorch.data_processing

"""Contain the functions concerning the processing of data."""

import warnings
import numbers
import platform

import numpy as np
import scipy.sparse

from scipy.sparse import issparse
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix

from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_array, _assert_all_finite
from sklearn.preprocessing import normalize as skNormalize

from collections.abc import Sequence

from cyanure_pytorch.logger import setup_custom_logger

logger = setup_custom_logger("INFO")


[docs]def preprocess(X, centering=False, normalize=True, columns=True):
    """
        Preprocess features training data.

        Perform in-place centering or normalization, either of columns or rows
        of the input matrix X.

        Parameters
        ----------
            X (numpy array or scipy sparse CSR matrix):
                Input matrix

            centering (boolean) : default=False
                Perform a centering operation

            normalize (boolean): default=True
                l2-normalization

    """

    training_data_fortran = np.asfortranarray(X)

    if scipy.sparse.issparse(X):
        raise TypeError("The library does not supports sparse data.")

    if columns:
        if centering:
            column_means = np.mean(training_data_fortran, axis=0, keepdims=True)
            training_data_fortran = training_data_fortran - column_means
        if normalize:
            training_data_fortran = skNormalize(training_data_fortran, axis=0, norm="l2")
    else:
        if centering:
            row_means = np.mean(training_data_fortran, axis=1, keepdims=True)
            training_data_fortran = training_data_fortran - row_means
        if normalize:
            training_data_fortran = skNormalize(training_data_fortran, axis=1, norm="l2")

    return training_data_fortran


def sklearn_catch_warnings(y, check_y_kwargs):
    with warnings.catch_warnings():
        if not issparse(y):
            try:
                y = check_array(y, dtype=None, **check_y_kwargs)
            except (np.VisibleDeprecationWarning, ValueError) as e:
                if str(e).startswith("Complex data not supported"):
                    raise

                # dtype=object should be provided explicitly for ragged arrays,
                # see NEP 34
                y = check_array(y, dtype=object, **check_y_kwargs)


def sklearn_check_invalid_inputs(y):
    # Invalid inputs
    if y.ndim not in (1, 2):
        # Number of dimension greater than 2: [[[1, 2]]]
        return "unknown"
    if not min(y.shape):
        # Empty ndarray: []/[[]]
        if y.ndim == 1:
            # 1-D empty array: []
            return "binary"  # []
        # 2-D empty array: [[]]
        return "unknown"
    if not issparse(y) and y.dtype == object and not isinstance(y.flat[0], str):
        # [obj_1] and not ["label_1"]
        return "unknown"


def sklearn_check_old_format(y):
    # The old sequence of sequences format
    try:
        if (
            not hasattr(y[0], "__array__")
            and isinstance(y[0], Sequence)
            and not isinstance(y[0], str)
        ):
            raise ValueError(
                "Sequence of sequences are not"
                " supported; use a binary array or sparse"
                " matrix instead."
            )
    except IndexError:
        pass


# Code from scikit-learn
def type_of_target(y, input_name=""):
    """Determine the type of data indicated by the target.
    Note that this type is the most specific type that can be inferred.
    For example:
        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.
    Parameters
    ----------
    y : {array-like, sparse matrix}
        Target values. If a sparse matrix, `y` is expected to be a
        CSR/CSC matrix.
    input_name: str, default=""
        The data name used to construct the error message.
        .. versionadded:: 1.1.0
    Returns
    -------
    target_type: str
        One of:
        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          array, sequence of sequences, or an array of non-sequence objects.
    Examples
    --------
    >>> from sklearn.utils.multiclass import type_of_target
    >>> import numpy as np
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multilabel-indicator'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    valid = (
        (isinstance(y, Sequence) or issparse(y) or hasattr(y, "__array__"))
        and not isinstance(y, str)
    )

    if not valid:
        raise ValueError(
            "Expected array-like (array or non-string sequence), got %r" % y
        )

    sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"]
    if sparse_pandas:
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if is_multilabel(y):
        return "multilabel-indicator"

    # DeprecationWarning will be replaced by ValueError, see NEP 34
    # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
    # We therefore catch both deprecation (NumPy < 1.24) warning and
    # value error (NumPy >= 1.24).
    check_y_kwargs = dict(
        accept_sparse=True,
        allow_nd=True,
        force_all_finite=False,
        ensure_2d=False,
        ensure_min_samples=0,
        ensure_min_features=0,
    )

    sklearn_catch_warnings(y, check_y_kwargs)

    sklearn_check_old_format(y)

    sklearn_check_invalid_inputs(y)

    # Check if multioutput
    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # Check float and contains non-integer float values
    if y.dtype.kind == "f":
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        data = y.data if issparse(y) else y
        if np.any(data != np.floor(data)):
            _assert_all_finite(data)
            return "continuous" + suffix

    # Check multiclass
    first_row = y[0] if not issparse(y) else y.getrow(0).data
    if np.unique(y).shape[0] > 2 or (y.ndim == 2 and len(first_row) > 1):
        # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
        return "multiclass" + suffix
    else:
        return "binary"  # [1, 2] or [["a"], ["b"]]


# Code from scikit-learn
def is_multilabel(y):
    """Check if ``y`` is in a multilabel format.
    Parameters
    ----------
    y : ndarray of shape (n_samples,)
        Target values.
    Returns
    -------
    out: bool
        Return ``True``, if ``y`` is in a multilabel format, else ```False``.
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    if hasattr(y, "__array__") or isinstance(y, Sequence):
        # DeprecationWarning will be replaced by ValueError, see NEP 34
        # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
        check_y_kwargs = dict(
            accept_sparse=True,
            allow_nd=True,
            force_all_finite=False,
            ensure_2d=False,
            ensure_min_samples=0,
            ensure_min_features=0,
        )
        sklearn_catch_warnings(y, check_y_kwargs)

    if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
        return False

    if issparse(y):
        if isinstance(y, (dok_matrix, lil_matrix)):
            y = y.tocsr()
        labels = np.unique(y.data)
        return (
            len(y.data) == 0
            or (labels.size == 1 or (labels.size == 2) and (0 in labels))
            and (np.dtype.kind in "biu" or _is_integral_float(labels))  # bool, int, uint
        )
    else:
        labels = np.unique(y)

        return len(labels) < 3 and (
            y.dtype.kind in "biu" or _is_integral_float(labels)  # bool, int, uint
        )


# Code from scikit-learn
def _is_integral_float(y):
    return y.dtype.kind == "f" and np.all(y.astype(int) == y)


[docs]def check_labels(labels, estimator):
    """
    Verify the format of labels depending on the type of the estimator.

    Can convert labels in some cases.

    Parameters
    ----------
        labels (numpy array or scipy sparse CSR matrix):
            Numpy array containing labels

        estimator (ERM):
            The estimator which will be fitted

    Raises
    ------
        ValueError:
            Format of the labels does not respect the format supported by Cyanure classifiers.

        ValueError:
            Labels have an non finite value

        ValueError:
            Problem has only one class

    Returns
    -------
        labels (numpy array or scipy sparse CSR matrix):
            Converted labels if required by the estimator.

        label_encoder (sklearn.LabelEncoder):
            Convert text labels if needed
    """
    label_encoder = None

    if estimator.__sklearn_tags__().estimator_type == "classifier":

        if np.issubdtype(type(labels[0]), np.str_):
            label_encoder = LabelEncoder()
            label_encoder.fit(labels)
            labels = label_encoder.transform(labels)

        y_type = type_of_target(labels)
        if y_type not in [
            "binary",
            "multiclass"
        ]:
            raise ValueError("Unknown label type: %r" % y_type)

    else:
        if type(labels[0]) not in (np.float32, np.float64):
            logger.info("The labels have been converted to float64")
            labels = labels.astype('float64')

    _assert_all_finite(labels)

    if estimator.__sklearn_tags__().estimator_type == "classifier" and len(np.unique(labels)) == 1:
        raise ValueError("Classifier can't train when only one class is present.")

    return labels, label_encoder


def check_is_finite(array_to_test):
    if not np.any(np.isfinite(array_to_test)):
        raise ValueError(
            "Input contains NaN, infinity or a value too large for dtype('float64').")


def get_element(array):
    """
    Get an element from an array of any depth.

    Args
    ----
        array (Type of the element):
            Array we want to get an element

    Returns
    -------
        Type of the element:
            One of the element of the array
    """
    element = array[0]
    for i in range(len(array.shape) - 1):
        element = element[i]
    return element


[docs]def check_input_type(X, labels, estimator):
    """
    Verify the format of labels and features depending on the type of the estimator.

    Can convert labels in some cases.

    Parameters
    ----------
        X (numpy array or scipy sparse CSR matrix):
            Numpy array containing features

        labels (numpy array or scipy sparse CSR matrix):
            Numpy array containing labels

        estimator (ERM):
            The estimator which will be fitted

    Raises
    ------
        ValueError:
            Data are complex

        ValueError:
            Data contains non finite value

        TypeError:
            Sparsed features are not CSR

        TypeError:
            Sparsed labels are not CSR

    Returns
    -------
        X (numpy array or scipy sparse CSR matrix):
            Converted features if required by the estimator.

        labels (numpy array or scipy sparse CSR matrix):
            Converted labels if required by the estimator.

        label_encoder (sklearn.LabelEncoder):
            Convert text labels if needed
    """
    label_encoder = None

    if np.iscomplexobj(X) or np.iscomplexobj(labels):
        raise ValueError("Complex data not supported")

    if not scipy.sparse.issparse(X) and not scipy.sparse.issparse(labels):
        x_element = get_element(X)
        if type(x_element) not in (np.float32, np.float64):

            logger.info("The features have been converted in float64")
            X = X.astype('float64')

        labels, label_encoder = check_labels(labels, estimator)

        _assert_all_finite(X)

    else:
        if scipy.sparse.issparse(X):
            raise TypeError("The library does not supports sparse data.")

        X, labels = windows_conversion(X, labels)

    return X, labels, label_encoder


def windows_conversion(X, labels):

    if platform.system() == "Windows":
        if scipy.sparse.issparse(X):
            X.indptr = X.indptr.astype(np.float64).astype(np.intc)
            X.indices = X.indices.astype(np.float64).astype(np.intc)
        if scipy.sparse.issparse(labels):
            labels.indptr = labels.indptr.astype(np.float64).astype(np.intc)
            labels.indices = labels.indices.astype(np.float64).astype(np.intc)

    return X, labels


[docs]def check_positive_parameter(parameter, message):
    """
    Check that a parameter if a number and positive.

    Parameters
    ----------
        parameter (Any):
            Parameter to verify

        message (string):
            Message of the exception

    Raises
    ------
        ValueError:
            Parameter is not a number

        ValueError:
            Parameter is not positive
    """
    if not isinstance(parameter, numbers.Number):
        raise ValueError(message)

    if isinstance(parameter, numbers.Number) and parameter < 0:
        raise ValueError(message)


[docs]def check_parameters(estimator):
    """
    Verify that the different parameters of an estimator respect the constraints.

    Parameters
    ----------
        estimator (ERM):
            Estimator to veriffy
    """
    check_positive_parameter(
        estimator.tol, "Tolerance for stopping criteria must be positive")
    check_positive_parameter(estimator.max_iter,
                             "Maximum number of iteration must be positive")
    check_positive_parameter(estimator.lambda_1,
                             "Penalty term must be positive")

    instance_class = estimator.__class__
    instance = instance_class()

    # Verify that it is not the default value
    if (estimator.penalty is None or estimator.penalty == "none") and estimator.lambda_1 != instance.lambda_1:
        warnings.warn("Setting penalty='none' will ignore the lambda_1")


def convert_to_array(X, labels):
    if not scipy.sparse.issparse(X) and not scipy.sparse.issparse(labels):
        if not isinstance(X, np.ndarray):
            X = np.array(X)
        if not isinstance(labels, np.ndarray):
            labels = np.array(labels)

    return X, labels


[docs]def check_input_fit(X, labels, estimator):
    """
    Check the different input arrays required for training according to the estimator type.

    Can convert data if necessary.

    Parameters
    ----------
        X (numpy array or scipy sparse CSR matrix):
            Numpy array containing features

        labels (numpy array or scipy sparse CSR matrix):
            Numpy array containing labels

        estimator (ERM):
            The estimator which will be fitted

    Raises
    ------
        ValueError:
            There is only one feature.

        ValueError:
            There is no sample.

        ValueError:
            An observation has no label.

        ValueError:
            Feature array has no feature

        ValueError:
            Features and labels does not have the same number of observations.

        ValueError:
            There is only one sample.

    Returns
    -------
        X (numpy array or scipy sparse CSR matrix):
            Converted features if required by the estimator.

        labels (numpy array or scipy sparse CSR matrix):
            Converted labels if required by the estimator.

        label_encoder (sklearn.LabelEncoder):
            Convert text labels if needed
    """

    X, labels = convert_to_array(X, labels)

    if X.ndim == 1:
        raise ValueError("The training array has only one dimension.")

    if X.shape[0] == 0:
        raise ValueError("Empty training array")

    if labels is None or True in np.array(np.equal(labels, None)):
        raise ValueError("y should be a 1d array")

    if len(X.shape) > 1 and X.shape[1] == 0:
        raise ValueError("0 feature(s) (shape=(" + str(X.shape[0]) + ", 0)) while a minimum of "
              + str(X.shape[0]) + " is required.")

    if labels.shape[0] != X.shape[0]:
        raise ValueError(
            "X and labels should have the same number of observations")

    if X.shape[0] == 1:
        raise ValueError("There should have more than 1 sample")

    if not estimator._get_tags()["multioutput"] and \
       not estimator._get_tags()["multioutput_only"] and labels.ndim > 1:
        warnings.warn(
            "A column-vector y was passed when a 1d array was expected", DataConversionWarning)

    X, labels, label_encoder = check_input_type(X, labels, estimator)
    check_parameters(estimator)

    return X, labels, label_encoder


[docs]def check_input_inference(X, estimator):
    """
    Check the format of the array which will be used for inference. Input array can be converted.

    Parameters
    ----------
        X (numpy array or scipy sparse CSR matrix):
            Array which will be used for inference

        estimator (ERM):
            Estimator which will be used

    Raises
    ------
        ValueError:
            One of the value is not finite

        ValueError:
            Shape of features is not correct

        ValueError:
            Shape of features does not correspond to estimators shape

    Returns
    -------
        X (numpy array or scipy sparse CSR matrix):
            Potentially converted array (if converted as numpy.float64)

    """
    if not scipy.sparse.issparse(X):
        if not isinstance(X, np.ndarray):
            X = np.array(X)
        x_element = get_element(X)
        if type(x_element) not in (np.float32, np.float64):
            X = X.astype("float64")

        _assert_all_finite(X)

    if X.ndim == 1:
        raise ValueError("Reshape your data")

    if X.shape[1] != estimator.n_features_in_:
        raise ValueError(f"X has 1 features, but estimator is expecting {estimator.n_features_in_} features as input")

    return X