Peax

noloox.cluster.Peax

Bases: ClusterMixin, BaseEstimator

Peax clustering model. The model estimates the number of clusters from density peaks, then uses Gaussian Mixtures with fixed means to estimate cluster probabilities.

Parameters:

Name Type Description Default
random_state Optional[int]

Random seed to use for fitting gaussian mixture to peaks.

None

Attributes:

Name Type Description
labels_ ndarray of shape (n_samples,)

Cluster labels for each point in X.

gmm_ FixedMeanGaussianMixture

The fitted Gaussian mixture model with fixed means.

means_ ndarray of shape (n_components, 2)

Coordinates of detected density peaks used as cluster means.

weights_ ndarray of shape (n_components,)

Final mixture component weights after refitting.

classes_ ndarray of shape (n_components,)

Sorted array of unique cluster labels.

density gaussian_kde

Kernel density estimator fitted to the input data.

Source code in noloox/cluster/peax.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
class Peax(ClusterMixin, BaseEstimator):
    """Peax clustering model.

    The model estimates the number of clusters from density peaks,
    then uses Gaussian Mixtures with fixed means to estimate cluster
    probabilities.

    Parameters
    ----------
    random_state : int, default=None
        Random seed to use for fitting gaussian mixture to peaks.

    Attributes
    ----------
    labels_ : ndarray of shape (n_samples,)
        Cluster labels for each point in `X`.
    gmm_ : FixedMeanGaussianMixture
        The fitted Gaussian mixture model with fixed means.
    means_ : ndarray of shape (n_components, 2)
        Coordinates of detected density peaks used as cluster means.
    weights_ : ndarray of shape (n_components,)
        Final mixture component weights after refitting.
    classes_ : ndarray of shape (n_components,)
        Sorted array of unique cluster labels.
    density : gaussian_kde
        Kernel density estimator fitted to the input data.
    X_range : tuple
        ``(min, max)`` taken over every value in the training data; both
        axes of the peak-search grid span this interval.
    """

    def __init__(self, random_state: Optional[int] = None):
        self.random_state = random_state

    def fit_predict(self, X, y=None):
        """Fit Peax clustering model and cluster datapoints.

        Parameters
        ----------
        X : array-like of shape (n_samples, 2)
            List of 2-dimensional data points. Each row corresponds to a
            single data point.
        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels for each datapoint.

        Raises
        ------
        ValueError
            If `X` is not a 2-dimensional array with exactly two columns.
        """
        # Work on an ndarray view for validation and density estimation so
        # that plain lists are accepted; the mixture model still receives
        # the caller's original object.
        data = np.asarray(X)
        # The peak search below runs on a 2-D grid, so exactly two features
        # are required (the previous `> 2` check let 1-feature input through).
        if data.ndim != 2 or data.shape[1] != 2:
            raise ValueError(
                f"X has shape {data.shape}. Peax only accepts 2D data, "
                "i.e. an array of shape (n_samples, 2)."
            )
        self.X_range = np.min(data), np.max(data)
        self.density = gaussian_kde(data.T, "scott")
        # Evaluate the density on a 100 x 100 grid; the same coordinates are
        # used for both axes.
        coord = np.linspace(*self.X_range, num=100)
        # Row i holds the density along x for the fixed y value coord[i].
        z = np.stack(
            [
                np.exp(
                    self.density.logpdf(
                        np.stack([coord, np.full(coord.shape, yval)])
                    )
                )
                for yval in coord
            ]
        )
        # After transposing, the first axis indexes x and the second y, so
        # the nonzero indices map directly onto (x, y) coordinates.
        peaks = detect_peaks(z.T)
        peak_ind = np.nonzero(peaks)
        peak_pos = np.stack([coord[peak_ind[0]], coord[peak_ind[1]]]).T
        # Initial component weights are proportional to the density at
        # each peak.
        weights = self.density.pdf(peak_pos.T)
        weights = weights / weights.sum()
        self.gmm_ = FixedMeanGaussianMixture(
            peak_pos.shape[0],
            means_init=peak_pos,
            weights_init=weights,
            random_state=self.random_state,
        )
        self.labels_ = self.gmm_.fit_predict(X)
        # Checking whether there are close to zero components
        is_zero = np.isclose(self.gmm_.weights_, 0)
        n_zero = np.sum(is_zero)
        if n_zero > 0:
            print(f"{n_zero} components have zero weight, removing them and refitting.")
        # The refit always runs, starting from the first fit's weights
        # restricted to the surviving peaks and renormalised.
        peak_pos = peak_pos[~is_zero]
        weights = self.gmm_.weights_[~is_zero]
        weights = weights / weights.sum()
        self.gmm_ = FixedMeanGaussianMixture(
            peak_pos.shape[0],
            means_init=peak_pos,
            weights_init=weights,
            random_state=self.random_state,
        )
        self.labels_ = self.gmm_.fit_predict(X)
        self.classes_ = np.sort(np.unique(self.labels_))
        self.means_ = self.gmm_.means_
        self.weights_ = self.gmm_.weights_
        return self.labels_

    def fit(self, X, y=None):
        """Fits clustering model to data.

        Parameters
        ----------
        X : array-like of shape (n_samples, 2)
            List of 2-dimensional data points. Each row corresponds to a
            single data point.
        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : Peax
            Fitted clustering model.
        """
        self.fit_predict(X, y)
        return self

    @property
    def n_components(self) -> int:
        """Number of clusters found in the data."""
        return self.gmm_.n_components

    def predict_proba(self, X):
        """Evaluate the components' density for each sample.

        Delegates to ``predict_proba`` of the fitted mixture model `gmm_`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        Returns
        -------
        resp : array, shape (n_samples, n_components)
            Density of each Gaussian component for each sample in X.
        """
        return self.gmm_.predict_proba(X)

    def score_samples(self, X):
        """Log-density of the fitted kernel density estimate at each sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data points to evaluate, one row per data point.

        Returns
        -------
        log_density : ndarray
            Result of ``density.logpdf`` evaluated on the rows of `X`.
        """
        # gaussian_kde expects points as columns, hence the transpose.
        return self.density.logpdf(np.asarray(X).T)

    def score(self, X, y=None):
        """Mean log-density of `X` under the fitted kernel density estimate.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data points to evaluate, one row per data point.
        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        score : float
            Average of ``score_samples(X)``.
        """
        return np.mean(self.score_samples(X))

n_components: int property

Number of clusters found in the data.

fit(X, y=None)

Fits clustering model to data.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features)

List of n_features-dimensional data points. Each row corresponds to a single data point.

required
y

Not used, present for API consistency by convention.

None

Returns:

Name Type Description
self Peax

Fitted clustering model.

Source code in noloox/cluster/peax.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def fit(self, X, y=None):
    """Fit the clustering model and hand back the estimator itself.

    All of the work is delegated to ``fit_predict``; the labels it
    returns are discarded here.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data points to cluster, one row per data point.
    y : Ignored
        Not used, present for API consistency by convention.

    Returns
    -------
    self : Peax
        The same estimator instance, after ``fit_predict`` has run on it.
    """
    # Only the fitting side effects matter; the predicted labels are dropped.
    _ = self.fit_predict(X, y)
    return self

fit_predict(X, y=None)

Fit Peax clustering model and cluster datapoints.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features)

List of n_features-dimensional data points. Each row corresponds to a single data point.

required
y

Not used, present for API consistency by convention.

None

Returns:

Name Type Description
labels ndarray of shape (n_samples,)

Cluster labels for each datapoint.

Source code in noloox/cluster/peax.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def fit_predict(self, X, y=None):
    """Fit Peax clustering model and cluster datapoints.

    Parameters
    ----------
    X : array-like of shape (n_samples, 2)
        List of 2-dimensional data points. Each row corresponds to a
        single data point.
    y : Ignored
        Not used, present for API consistency by convention.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster labels for each datapoint.

    Raises
    ------
    ValueError
        If `X` is not a 2-dimensional array with exactly two columns.
    """
    # Work on an ndarray view for validation and density estimation so that
    # plain lists are accepted; the mixture model still receives the
    # caller's original object.
    data = np.asarray(X)
    # The peak search below runs on a 2-D grid, so exactly two features are
    # required (the previous `> 2` check let 1-feature input through).
    if data.ndim != 2 or data.shape[1] != 2:
        raise ValueError(
            f"X has shape {data.shape}. Peax only accepts 2D data, "
            "i.e. an array of shape (n_samples, 2)."
        )
    self.X_range = np.min(data), np.max(data)
    self.density = gaussian_kde(data.T, "scott")
    # Evaluate the density on a 100 x 100 grid; the same coordinates are
    # used for both axes.
    coord = np.linspace(*self.X_range, num=100)
    # Row i holds the density along x for the fixed y value coord[i].
    z = np.stack(
        [
            np.exp(
                self.density.logpdf(
                    np.stack([coord, np.full(coord.shape, yval)])
                )
            )
            for yval in coord
        ]
    )
    # After transposing, the first axis indexes x and the second y, so the
    # nonzero indices map directly onto (x, y) coordinates.
    peaks = detect_peaks(z.T)
    peak_ind = np.nonzero(peaks)
    peak_pos = np.stack([coord[peak_ind[0]], coord[peak_ind[1]]]).T
    # Initial component weights are proportional to the density at each peak.
    weights = self.density.pdf(peak_pos.T)
    weights = weights / weights.sum()
    self.gmm_ = FixedMeanGaussianMixture(
        peak_pos.shape[0],
        means_init=peak_pos,
        weights_init=weights,
        random_state=self.random_state,
    )
    self.labels_ = self.gmm_.fit_predict(X)
    # Checking whether there are close to zero components
    is_zero = np.isclose(self.gmm_.weights_, 0)
    n_zero = np.sum(is_zero)
    if n_zero > 0:
        print(f"{n_zero} components have zero weight, removing them and refitting.")
    # The refit always runs, starting from the first fit's weights
    # restricted to the surviving peaks and renormalised.
    peak_pos = peak_pos[~is_zero]
    weights = self.gmm_.weights_[~is_zero]
    weights = weights / weights.sum()
    self.gmm_ = FixedMeanGaussianMixture(
        peak_pos.shape[0],
        means_init=peak_pos,
        weights_init=weights,
        random_state=self.random_state,
    )
    self.labels_ = self.gmm_.fit_predict(X)
    self.classes_ = np.sort(np.unique(self.labels_))
    self.means_ = self.gmm_.means_
    self.weights_ = self.gmm_.weights_
    return self.labels_

predict_proba(X)

Evaluate the components' density for each sample.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features)

List of n_features-dimensional data points. Each row corresponds to a single data point.

required

Returns:

Name Type Description
resp ndarray of shape (n_samples, n_components)

Density of each Gaussian component for each sample in X.

Source code in noloox/cluster/peax.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
def predict_proba(self, X):
    """Return the fitted mixture model's per-component output for each sample.

    This is a thin wrapper: the call is forwarded unchanged to
    ``predict_proba`` of the mixture stored in ``gmm_``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data points to evaluate, one row per data point.

    Returns
    -------
    resp : array, shape (n_samples, n_components)
        Whatever ``gmm_.predict_proba(X)`` returns; one value per
        sample and mixture component.
    """
    mixture = self.gmm_
    return mixture.predict_proba(X)