"""Contain the functions concerning the processing of data."""
import warnings
import numbers
import platform
import numpy as np
import scipy.sparse
from scipy.sparse import issparse
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_array, _assert_all_finite
from sklearn.preprocessing import normalize as skNormalize
from collections.abc import Sequence
from cyanure_pytorch.logger import setup_custom_logger
# Module-level logger used by the helpers in this file.
# NOTE(review): "INFO" is presumably the verbosity level -- confirm against
# cyanure_pytorch.logger.setup_custom_logger.
logger = setup_custom_logger("INFO")
def preprocess(X, centering=False, normalize=True, columns=True):
    """
    Preprocess features training data.

    Center and/or l2-normalize the input matrix X, either column-wise or
    row-wise. The input is not modified: the processed data is returned.

    Parameters
    ----------
        X (numpy array):
            Input matrix. Sparse matrices are rejected.

        centering (boolean) : default=False
            Perform a centering operation (subtract the mean)

        normalize (boolean): default=True
            Perform an l2 normalization operation

        columns (boolean): default=True
            Operate on columns (True) or on rows (False).

    Raises
    ------
        TypeError:
            X is a scipy sparse matrix

    Returns
    -------
        numpy array:
            The processed data.
    """
    # Reject sparse input before any dense conversion is attempted.
    if scipy.sparse.issparse(X):
        raise TypeError("The library does not supports sparse data.")

    training_data_fortran = np.asfortranarray(X)

    # axis=0 works on each column, axis=1 on each row.
    axis = 0 if columns else 1

    if centering:
        means = np.mean(training_data_fortran, axis=axis, keepdims=True)
        training_data_fortran = training_data_fortran - means

    if normalize:
        training_data_fortran = skNormalize(
            training_data_fortran, axis=axis, norm="l2")

    return training_data_fortran
def sklearn_catch_warnings(y, check_y_kwargs):
    """
    Validate a dense target with ``check_array``, tolerating ragged input.

    Parameters
    ----------
        y (array-like or scipy sparse matrix):
            Target values. Sparse matrices are returned untouched.

        check_y_kwargs (dict):
            Keyword arguments forwarded to ``check_array``.

    Raises
    ------
        ValueError:
            ``check_array`` rejected the data as complex, or rejected it
            again when retried with ``dtype=object``.

    Returns
    -------
        array or scipy sparse matrix:
            The validated array, or ``y`` itself when it is sparse.
    """
    if issparse(y):
        return y

    # NumPy >= 1.25 exposes the warning in ``numpy.exceptions`` and NumPy 2.0
    # removed it from the top-level namespace; older releases only have the
    # top-level name.
    ragged_warning = getattr(np, "exceptions", np).VisibleDeprecationWarning

    with warnings.catch_warnings():
        # Turn the ragged-array deprecation warning into an exception so that
        # it is handled exactly like the ValueError raised by NumPy >= 1.24.
        warnings.simplefilter("error", ragged_warning)
        try:
            y = check_array(y, dtype=None, **check_y_kwargs)
        except (ragged_warning, ValueError) as e:
            if str(e).startswith("Complex data not supported"):
                raise

            # dtype=object should be provided explicitly for ragged arrays,
            # see NEP 34
            y = check_array(y, dtype=object, **check_y_kwargs)

    return y
def sklearn_check_invalid_inputs(y):
    """
    Classify targets whose structure is invalid or empty.

    Parameters
    ----------
        y (numpy array or scipy sparse matrix):
            Target values, already converted to an array.

    Returns
    -------
        str or None:
            "unknown" or "binary" when one of the special cases below
            applies, None otherwise.
    """
    n_dims = y.ndim

    # Anything that is neither 1-D nor 2-D, e.g. [[[1, 2]]]
    if n_dims != 1 and n_dims != 2:
        return "unknown"

    # Empty ndarray: [] is treated as binary, [[]] as unknown
    if min(y.shape) == 0:
        return "binary" if n_dims == 1 else "unknown"

    # Object arrays are only accepted when they hold strings:
    # [obj_1] is rejected, ["label_1"] is not
    holds_objects = (not issparse(y)) and y.dtype == object
    if holds_objects and not isinstance(y.flat[0], str):
        return "unknown"

    return None
def sklearn_check_old_format(y):
    """
    Reject the legacy "sequence of sequences" multilabel representation.

    Parameters
    ----------
        y (array-like):
            Target values.

    Raises
    ------
        ValueError:
            The first element of ``y`` is a non-string sequence that does not
            expose ``__array__``.
    """
    try:
        first = y[0]
    except IndexError:
        # Empty input: there is no first element to inspect.
        return

    # The old sequence of sequences format
    if (
        not hasattr(first, "__array__")
        and isinstance(first, Sequence)
        and not isinstance(first, str)
    ):
        raise ValueError(
            "Sequence of sequences are not"
            " supported; use a binary array or sparse"
            " matrix instead."
        )
# Code from scikit-learn
def type_of_target(y, input_name=""):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

    * ``binary`` is more specific but compatible with ``multiclass``.
    * ``multiclass`` of integers is more specific but compatible with
      ``continuous``.
    * ``multilabel-indicator`` is more specific but compatible with
      ``multiclass-multioutput``.

    Parameters
    ----------
    y : {array-like, sparse matrix}
        Target values. If a sparse matrix, `y` is expected to be a
        CSR/CSC matrix.

    input_name: str, default=""
        The data name used to construct the error message. It is accepted
        for signature compatibility and is not used by this implementation.

        .. versionadded:: 1.1.0

    Returns
    -------
    target_type: str
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          array, sequence of sequences, or an array of non-sequence objects.

    Raises
    ------
    ValueError
        `y` is not array-like, is a pandas SparseSeries/SparseArray, or uses
        the legacy sequence of sequences format.

    Examples
    --------
    >>> from sklearn.utils.multiclass import type_of_target
    >>> import numpy as np
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multilabel-indicator'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    valid = (
        isinstance(y, Sequence) or issparse(y) or hasattr(y, "__array__")
    ) and not isinstance(y, str)

    if not valid:
        raise ValueError(
            "Expected array-like (array or non-string sequence), got %r" % y
        )

    sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"]
    if sparse_pandas:
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if is_multilabel(y):
        return "multilabel-indicator"

    # DeprecationWarning will be replaced by ValueError, see NEP 34
    # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
    # We therefore catch both deprecation (NumPy < 1.24) warning and
    # value error (NumPy >= 1.24).
    # NOTE(review): these options mirror scikit-learn's ``type_of_target``;
    # ``ensure_all_finite`` is the scikit-learn >= 1.6 spelling of
    # ``force_all_finite`` -- confirm against the pinned version.
    check_y_kwargs = dict(
        accept_sparse=True,
        allow_nd=True,
        ensure_all_finite=False,
        ensure_2d=False,
        ensure_min_samples=0,
        ensure_min_features=0,
    )
    # Use the validated array when the helper hands one back; otherwise keep
    # ``y`` as it was given.
    converted = sklearn_catch_warnings(y, check_y_kwargs)
    if converted is not None:
        y = converted

    sklearn_check_old_format(y)

    # Empty, > 2-D and non-string object targets are settled here, before
    # any code that indexes into ``y``.
    invalid_type = sklearn_check_invalid_inputs(y)
    if invalid_type is not None:
        return invalid_type

    # Check if multioutput
    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # Check float and contains non-integer float values
    if y.dtype.kind == "f":
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        data = y.data if issparse(y) else y
        if np.any(data != np.floor(data)):
            return "continuous" + suffix

    # Check multiclass
    first_row = y[0] if not issparse(y) else y.getrow(0).data
    if np.unique(y).shape[0] > 2 or (y.ndim == 2 and len(first_row) > 1):
        # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
        return "multiclass" + suffix

    return "binary"  # [1, 2] or [["a"], ["b"]]
# Code from scikit-learn
def is_multilabel(y):
    """Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : ndarray of shape (n_samples,)
        Target values.

    Returns
    -------
    out: bool
        Return ``True``, if ``y`` is in a multilabel format, else ``False``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    if hasattr(y, "__array__") or isinstance(y, Sequence):
        # DeprecationWarning will be replaced by ValueError, see NEP 34
        # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
        # NOTE(review): these options mirror scikit-learn's ``is_multilabel``;
        # ``ensure_all_finite`` is the scikit-learn >= 1.6 spelling of
        # ``force_all_finite`` -- confirm against the pinned version.
        check_y_kwargs = dict(
            accept_sparse=True,
            allow_nd=True,
            ensure_all_finite=False,
            ensure_2d=False,
            ensure_min_samples=0,
            ensure_min_features=0,
        )
        # Use the validated array when the helper hands one back; otherwise
        # keep ``y`` as it was given.
        converted = sklearn_catch_warnings(y, check_y_kwargs)
        if converted is not None:
            y = converted

    if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
        return False

    if issparse(y):
        # DOK/LIL matrices are converted to CSR before reading ``.data``.
        if isinstance(y, (dok_matrix, lil_matrix)):
            y = y.tocsr()
        labels = np.unique(y.data)
        # The dtype inspected here is the one of ``y`` itself.
        return (
            len(y.data) == 0
            or (labels.size == 1 or (labels.size == 2) and (0 in labels))
            and (y.dtype.kind in "biu" or _is_integral_float(labels))  # bool, int, uint
        )

    labels = np.unique(y)
    return len(labels) < 3 and (
        y.dtype.kind in "biu" or _is_integral_float(labels)  # bool, int, uint
    )
# Code from scikit-learn
def _is_integral_float(y):
return y.dtype.kind == "f" and np.all(y.astype(int) == y)
def check_labels(labels, estimator):
    """
    Verify the format of labels depending on the type of the estimator.

    Can convert labels in some cases.

    Parameters
    ----------
        labels (numpy array or scipy sparse CSR matrix):
            Numpy array containing labels

        estimator (ERM):
            The estimator which will be fitted

    Raises
    ------
        ValueError:
            Format of the labels does not respect the format supported by Cyanure classifiers.

        ValueError:
            Labels have an non finite value

        ValueError:
            Problem has only one class

    Returns
    -------
        labels (numpy array or scipy sparse CSR matrix):
            Converted labels if required by the estimator.

        label_encoder (sklearn.LabelEncoder):
            Convert text labels if needed
    """
    label_encoder = None
    is_classifier = estimator.__sklearn_tags__().estimator_type == "classifier"

    if is_classifier:
        if np.issubdtype(type(labels[0]), np.str_):
            # The encoder has to be fitted before it can transform.
            label_encoder = LabelEncoder()
            labels = label_encoder.fit_transform(labels)

        y_type = type_of_target(labels)
        # NOTE(review): accepted types follow scikit-learn's
        # ``check_classification_targets`` -- confirm the list.
        if y_type not in [
            "binary",
            "multiclass",
            "multiclass-multioutput",
            "multilabel-indicator",
            "multilabel-sequences",
        ]:
            raise ValueError("Unknown label type: %r" % y_type)
    else:
        if type(labels[0]) not in (np.float32, np.float64):
            logger.info("The labels have been converted to float64")
            labels = labels.astype('float64')

    # Non-finite labels are documented to raise; for a sparse matrix only the
    # stored values are inspected.
    check_is_finite(labels.data if issparse(labels) else labels)

    if is_classifier and len(np.unique(labels)) == 1:
        raise ValueError("Classifier can't train when only one class is present.")

    return labels, label_encoder
def check_is_finite(array_to_test):
    """
    Verify that every value of an array is finite.

    Parameters
    ----------
        array_to_test (numpy array):
            Array to verify

    Raises
    ------
        ValueError:
            At least one value is NaN or infinite
    """
    # A single non-finite value is enough to reject the input; an empty
    # array has nothing to reject.
    if not np.all(np.isfinite(array_to_test)):
        raise ValueError(
            "Input contains NaN, infinity or a value too large for dtype('float64').")
def get_element(array):
    """
    Get an element from an array of any depth.

    Parameters
    ----------
        array (numpy array):
            Array we want to get an element

    Returns
    -------
        Type of the element:
            One of the element of the array (the one at index 0 along
            every dimension)
    """
    element = array[0]
    # Always take index 0: a position that grows with the depth may fall
    # outside of a short dimension.
    for _ in range(len(array.shape) - 1):
        element = element[0]

    return element
def windows_conversion(X, labels):
    """
    Recast the index arrays of sparse inputs when running on Windows.

    On any other platform, and for dense inputs, nothing is changed.
    Sparse inputs are modified in place.

    Parameters
    ----------
        X (numpy array or scipy sparse matrix):
            Input matrix

        labels (numpy array or scipy sparse matrix):
            Labels

    Returns
    -------
        tuple:
            The pair (X, labels)
    """
    if platform.system() != "Windows":
        return X, labels

    for matrix in (X, labels):
        if scipy.sparse.issparse(matrix):
            # NOTE(review): the float64 round-trip before the intc cast is
            # preserved deliberately; its purpose is not evident from this
            # module.
            matrix.indptr = matrix.indptr.astype(np.float64).astype(np.intc)
            matrix.indices = matrix.indices.astype(np.float64).astype(np.intc)

    return X, labels
def check_positive_parameter(parameter, message):
    """
    Check that a parameter is a number and is not negative.

    Parameters
    ----------
        parameter (Any):
            Parameter to verify

        message (string):
            Message of the exception

    Raises
    ------
        ValueError:
            Parameter is not a number

        ValueError:
            Parameter is not positive
    """
    # The comparison is only reached for numbers; zero is accepted.
    if not isinstance(parameter, numbers.Number) or parameter < 0:
        raise ValueError(message)
def check_parameters(estimator):
    """
    Verify that the different parameters of an estimator respect the constraints.

    Parameters
    ----------
        estimator (ERM):
            Estimator to verify

    Raises
    ------
        ValueError:
            One of the checked parameters is not a number or is negative
    """
    check_positive_parameter(
        estimator.tol, "Tolerance for stopping criteria must be positive")
    # NOTE(review): the attribute names of the next two checks (``max_iter``
    # and ``lambda_1``) are inferred from their messages -- confirm.
    check_positive_parameter(
        estimator.max_iter, "Maximum number of iteration must be positive")
    check_positive_parameter(
        estimator.lambda_1, "Penalty term must be positive")

    # A default-constructed estimator of the same class provides the default
    # value of lambda_1.
    instance_class = estimator.__class__
    instance = instance_class()

    # Verify that it is not the default value
    if (estimator.penalty is None or estimator.penalty == "none") and estimator.lambda_1 != instance.lambda_1:
        warnings.warn("Setting penalty='none' will ignore the lambda_1")
def convert_to_array(X, labels):
    """
    Convert dense inputs to numpy arrays.

    When either input is a scipy sparse matrix, both are returned untouched.

    Parameters
    ----------
        X (array-like or scipy sparse matrix):
            Input matrix

        labels (array-like or scipy sparse matrix):
            Labels

    Returns
    -------
        tuple:
            The pair (X, labels), each converted with ``np.array`` unless it
            was already a numpy array
    """
    if scipy.sparse.issparse(X) or scipy.sparse.issparse(labels):
        return X, labels

    features = X if isinstance(X, np.ndarray) else np.array(X)
    targets = labels if isinstance(labels, np.ndarray) else np.array(labels)
    return features, targets