Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v0.11.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ Bug fixes
`bool` and `pd.category` by delegating the conversion to scikit-learn encoder.
:pr:`1002` by :user:`Guillaume Lemaitre <glemaitre>`.

- Handle sparse matrices in :class:`~imblearn.over_sampling.SMOTEN` and raise a warning
since it requires a conversion to dense matrices.
:pr:`1003` by :user:`Guillaume Lemaitre <glemaitre>`.

Compatibility
.............

Expand Down
21 changes: 19 additions & 2 deletions imblearn/over_sampling/_smote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import numpy as np
from scipy import sparse
from sklearn.base import clone
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.utils import _safe_indexing, check_array, check_random_state
from sklearn.utils.sparsefuncs_fast import (
Expand Down Expand Up @@ -893,7 +894,7 @@ def _check_X_y(self, X, y):
y,
reset=True,
dtype=None,
accept_sparse=False,
accept_sparse=["csr", "csc"],
)
return X, y, binarize_y

Expand Down Expand Up @@ -927,6 +928,17 @@ def _fit_resample(self, X, y):
FutureWarning,
)

if sparse.issparse(X):
X_sparse_format = X.format
X = X.toarray()
warnings.warn(
"Passing a sparse matrix to SMOTEN is not really efficient since it is"
" converted to a dense array internally.",
DataConversionWarning,
)
else:
X_sparse_format = None

self._validate_estimator()

X_resampled = [X.copy()]
Expand Down Expand Up @@ -964,7 +976,12 @@ def _fit_resample(self, X, y):
X_resampled = np.vstack(X_resampled)
y_resampled = np.hstack(y_resampled)

return X_resampled, y_resampled
if X_sparse_format == "csr":
return sparse.csr_matrix(X_resampled), y_resampled
elif X_sparse_format == "csc":
return sparse.csc_matrix(X_resampled), y_resampled
else:
return X_resampled, y_resampled

def _more_tags(self):
return {"X_types": ["2darray", "dataframe", "string"]}
22 changes: 21 additions & 1 deletion imblearn/over_sampling/_smote/tests/test_smoten.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest
from sklearn.preprocessing import OrdinalEncoder
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.utils._testing import _convert_container

from imblearn.over_sampling import SMOTEN

Expand Down Expand Up @@ -56,6 +58,24 @@ def test_smoten_resampling():
np.testing.assert_array_equal(y_generated, "not apple")


@pytest.mark.parametrize("sparse_format", ["sparse_csr", "sparse_csc"])
def test_smoten_sparse_input(data, sparse_format):
    """Check that SMOTEN accepts sparse input and preserves its format.

    A `DataConversionWarning` is expected when a sparse matrix is passed,
    and the resampled matrix must come back in the same sparse format as
    the input.

    Non-regression test for:
    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/971
    """
    X_raw, y = data
    # One-hot encode first so that the container to convert is numeric.
    X_encoded = OneHotEncoder().fit_transform(X_raw)
    X_sparse = _convert_container(X_encoded, sparse_format)
    sampler = SMOTEN(random_state=0)

    with pytest.warns(DataConversionWarning, match="is not really efficient"):
        X_res, y_res = sampler.fit_resample(X_sparse, y)

    n_resampled_rows = X_res.shape[0]
    assert X_res.format == X_sparse.format
    assert n_resampled_rows == len(y_res)


def test_smoten_categorical_encoder(data):
"""Check that `categorical_encoder` is used when provided."""

Expand Down