DirichletMultinomialMixture

`noloox.mixture.DirichletMultinomialMixture`

Bases: BaseEstimator, ClusterMixin, DensityMixin

Implementation of the Dirichlet Multinomial Mixture Model with Gibbs Sampling solver

Parameters:

Name	Type	Description	Default
`n_components`	`int`	Number of mixture components in the model.	required
`n_iter`	`int`	Number of iterations during fitting. If you find your results are unsatisfactory, increase this number.	`50`
`alpha`	`float`	Willingness of a document to join an empty cluster.	`0.1`
`beta`	`float`	Willingness of a document to join clusters in which its terms are not present.	`0.1`
`random_state`	`Optional[int]`	Random seed to use for reproducibility.	`None`

Attributes:

Name	Type	Description
`components_`	`array of shape (n_components, n_vocab)`	Describes all components of the topic distribution. Contains the number of times each word has been assigned to each component during fitting.
`n_features_in_`	`int`	Number of total vocabulary items seen during fitting.

Source code in noloox/mixture/dmm.py

class DirichletMultinomialMixture(BaseEstimator, ClusterMixin, DensityMixin):
    """Implementation of the Dirichlet Multinomial Mixture Model with a Gibbs
    Sampling solver.

    Parameters
    ----------
    n_components: int
        Number of mixture components in the model.
    n_iter: int, default 50
        Number of iterations during fitting.
        If you find your results are unsatisfactory, increase this number.
    alpha: float, default 0.1
        Willingness of a document to join an empty cluster.
    beta: float, default 0.1
        Willingness of a document to join clusters in which its terms
        are not present.
    random_state: int, default None
        Random seed to use for reproducibility.

    Attributes
    ----------
    components_: array of shape (n_components, n_vocab)
        Describes all components of the topic distribution.
        Contains the number of times each word has been assigned to each
        component during fitting.
    n_features_in_: int
        Number of total vocabulary items seen during fitting.
    labels_: array
        Cluster labels produced for the training documents by the sampler.
    weights_: array
        The sampler's ``m_z`` statistic normalised so that it sums to one.
    """

    def __init__(
        self,
        n_components: int,
        n_iter: int = 50,
        alpha: float = 0.1,
        beta: float = 0.1,
        random_state: Optional[int] = None,
    ):
        super().__init__()
        self.n_components = n_components
        self.n_iter = n_iter
        self.alpha = alpha
        self.beta = beta
        self.random_state = random_state

    def get_params(self, deep: bool = False) -> dict:
        """Get parameters for this estimator.

        Parameters
        ----------
        deep: bool, default False
            Ignored, exists for sklearn compatibility.

        Returns
        -------
        dict
            Parameter names mapped to their values. Every constructor
            argument is included, ``random_state`` among them.

        Note
        ----
        Exists for sklearn compatibility.
        """
        return {
            "n_components": self.n_components,
            "n_iter": self.n_iter,
            "alpha": self.alpha,
            "beta": self.beta,
            # Without this key a params round-trip (e.g. cloning the
            # estimator) would silently drop the configured seed.
            "random_state": self.random_state,
        }

    def fit_predict(self, X, y=None):
        """Fits the model using Gibbs Sampling. Detailed description of the
        algorithm in Yin and Wang (2014).

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            BOW matrix of corpus.
        y: None
            Ignored, exists for sklearn compatibility.

        Returns
        -------
        array
            Cluster labels assigned to the documents in ``X`` during fitting
            (also stored as ``labels_``).
        """
        if issparse(X):
            warnings.warn(
                "Sparse arrays are not yet supported. Implicitly converting to dense array."
            )
            X = np.asarray(X.todense())
        if self.random_state is not None:
            random_key = jax.random.key(self.random_state)
        else:
            # No seed given: draw one from Python's global RNG (0..1000).
            random_key = jax.random.key(random.randint(0, 1000))
        random_key, self.components_, self.labels_, self.m_z, self.n_z = fit_model(
            random_key,
            self.n_components,
            self.n_iter,
            self.alpha,
            self.beta,
            X,
        )
        self.weights_ = np.asarray(self.m_z) / np.sum(self.m_z)
        self.components_ = np.asarray(self.components_)
        n_docs, n_vocab = X.shape
        # Record the vocabulary size, as promised in the class docstring.
        self.n_features_in_ = n_vocab
        # Vectorise the per-document conditional probability over rows of X;
        # the fitted statistics are read from ``self`` when it is called.
        self._predict_proba = jax.vmap(
            lambda x: softmax(
                log_cond_prob(
                    self.m_z,
                    self.components_,
                    self.n_z,
                    x,
                    n_docs,
                    self.n_components,
                    n_vocab,
                    self.alpha,
                    self.beta,
                )
            ),
        )

        return self.labels_

    def predict_proba(self, X) -> np.ndarray:
        """Predicts probabilities for each document belonging to each
        component.

        Parameters
        ----------
        X: array-like  of shape (n_samples, n_features)
            Document-term matrix.

        Returns
        -------
        array of shape (n_samples, n_components)
            Probabilities for each document belonging to each cluster.

        Raises
        ------
        NotFittedError
            If the model is not fitted, an exception will be raised
        """
        if not hasattr(self, "_predict_proba"):
            raise NotFittedError("Model not fitted yet, can't predict probabilities.")
        if issparse(X):
            warnings.warn(
                "Sparse arrays are not yet supported. Implicitly converting to dense array."
            )
            X = np.asarray(X.todense())
        p = self._predict_proba(X)
        return np.asarray(p)

    def fit(self, X, y=None):
        """Fits the model by delegating to fit_predict() and returns the
        estimator itself.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            BOW matrix of corpus.
        y: None
            Ignored, exists for sklearn compatibility.

        Returns
        -------
        DirichletMultinomialMixture
            The fitted model.
        """
        self.fit_predict(X, y)
        return self

    def transform(self, X) -> np.ndarray:
        """Alias for predict_proba()."""
        return self.predict_proba(X)

    def predict(self, X) -> np.ndarray:
        """Predicts cluster labels for a set of documents. Mainly exists for
        compatibility with density estimators in sklearn.

        Parameters
        ----------
        X: array-like  of shape (n_samples, n_features)
            Document-term matrix.

        Returns
        -------
        array of shape (n_samples,)
            Cluster label for each document.

        Raises
        ------
        NotFittedError
            If the model is not fitted, an exception will be raised
        """
        return np.argmax(self.predict_proba(X), axis=1)

    def fit_transform(
        self,
        X,
        y=None,
    ) -> np.ndarray:
        """Fits the model, then transforms the given data.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            Document-term matrix.
        y: None
            Ignored, sklearn compatibility.

        Returns
        -------
        array of shape (n_samples, n_components)
            Probabilities for each document belonging to each cluster.
        """
        return self.fit(X).transform(X)

`fit_predict(X, y=None)`

Fits the model using Gibbs Sampling. Detailed description of the algorithm in Yin and Wang (2014).

Parameters:

Name	Type	Description	Default
`X`		BOW matrix of corpus.	required
`y`		Ignored, exists for sklearn compatibility.	`None`

Returns:

Type	Description
`array`	Cluster labels assigned to the documents during fitting (the model's `labels_`).

Source code in noloox/mixture/dmm.py

def fit_predict(self, X, y=None):
    """Fits the model using Gibbs Sampling. Detailed description of the
    algorithm in Yin and Wang (2014).

    Parameters
    ----------
    X: array-like of shape (n_samples, n_features)
        BOW matrix of corpus.
    y: None
        Ignored, exists for sklearn compatibility.

    Returns
    -------
    array
        Cluster labels assigned to the documents in ``X`` during fitting
        (also stored as ``labels_``).
    """
    if issparse(X):
        warnings.warn(
            "Sparse arrays are not yet supported. Implicitly converting to dense array."
        )
        X = np.asarray(X.todense())
    if self.random_state is not None:
        random_key = jax.random.key(self.random_state)
    else:
        # No seed given: draw one from Python's global RNG (0..1000).
        random_key = jax.random.key(random.randint(0, 1000))
    random_key, self.components_, self.labels_, self.m_z, self.n_z = fit_model(
        random_key,
        self.n_components,
        self.n_iter,
        self.alpha,
        self.beta,
        X,
    )
    self.weights_ = np.asarray(self.m_z) / np.sum(self.m_z)
    self.components_ = np.asarray(self.components_)
    n_docs, n_vocab = X.shape
    # Record the number of columns of the training matrix on the estimator.
    self.n_features_in_ = n_vocab
    # Vectorise the per-document conditional probability over rows of X;
    # the fitted statistics are read from ``self`` when it is called.
    self._predict_proba = jax.vmap(
        lambda x: softmax(
            log_cond_prob(
                self.m_z,
                self.components_,
                self.n_z,
                x,
                n_docs,
                self.n_components,
                n_vocab,
                self.alpha,
                self.beta,
            )
        ),
    )

    return self.labels_

`fit_transform(X, y=None)`

Fits the model, then transforms the given data.

Parameters:

Name	Type	Description	Default
`X`		Document-term matrix.	required
`y`		Ignored, sklearn compatibility.	`None`

Returns:

Type	Description
`array of shape (n_samples, n_components)`	Probabilities for each document belonging to each cluster.

Source code in noloox/mixture/dmm.py

def fit_transform(
    self,
    X,
    y=None,
) -> np.ndarray:
    """Fit the model on ``X`` and return its transformation of ``X``.

    Parameters
    ----------
    X: array-like of shape (n_samples, n_features)
        Document-term matrix.
    y: None
        Ignored, sklearn compatibility.

    Returns
    -------
    array of shape (n_samples, n_components)
        Probabilities for each document belonging to each cluster.
    """
    # fit() hands back the estimator, which is then asked to transform
    # the very same data it was fitted on.
    fitted = self.fit(X)
    return fitted.transform(X)

`get_params(deep=False)`

Get parameters for this estimator.

Parameters:

Name	Type	Description	Default
`deep`	`bool`	Ignored, exists for sklearn compatibility.	`False`

Returns:

Type	Description
`dict`	Parameter names mapped to their values.

Note

Exists for sklearn compatibility.

Source code in noloox/mixture/dmm.py

def get_params(self, deep: bool = False) -> dict:
    """Get parameters for this estimator.

    Parameters
    ----------
    deep: bool, default False
        Ignored, exists for sklearn compatibility.

    Returns
    -------
    dict
        Parameter names mapped to their values. ``random_state`` is
        included alongside the other hyperparameters.

    Note
    ----
    Exists for sklearn compatibility.
    """
    return {
        "n_components": self.n_components,
        "n_iter": self.n_iter,
        "alpha": self.alpha,
        "beta": self.beta,
        # Without this key a params round-trip (e.g. cloning the
        # estimator) would silently drop the configured seed.
        "random_state": self.random_state,
    }

`predict(X)`

Predicts cluster labels for a set of documents. Mainly exists for compatibility with density estimators in sklearn.

Parameters:

Name	Type	Description	Default
`X`		Document-term matrix.	required

Returns:

Type	Description
`array of shape (n_samples,)`	Cluster label for each document.

Raises:

Type	Description
`NotFittedError`	If the model is not fitted, an exception will be raised

Source code in noloox/mixture/dmm.py

def predict(self, X) -> np.ndarray:
    """Return the most probable cluster for every document.

    Takes the output of predict_proba() and picks, for each row, the index
    of the largest probability. Mainly exists for compatibility with
    density estimators in sklearn.

    Parameters
    ----------
    X: array-like  of shape (n_samples, n_features)
        Document-term matrix.

    Returns
    -------
    array of shape (n_samples,)
        Cluster label for each document.
    """
    membership = self.predict_proba(X)
    # Row-wise argmax: one winning component index per document.
    return np.argmax(membership, axis=1)

`predict_proba(X)`

Predicts probabilities for each document belonging to each component.

Parameters:

Name	Type	Description	Default
`X`		Document-term matrix.	required

Returns:

Type	Description
`array of shape (n_samples, n_components)`	Probabilities for each document belonging to each cluster.

Raises:

Type	Description
`NotFittedError`	If the model is not fitted, an exception will be raised

Source code in noloox/mixture/dmm.py

def predict_proba(self, X) -> np.ndarray:
    """Compute, per document, the probability of belonging to each component.

    Parameters
    ----------
    X: array-like  of shape (n_samples, n_features)
        Document-term matrix. Sparse input is converted to a dense array
        after emitting a warning.

    Returns
    -------
    array of shape (n_samples, n_components)
        Probabilities for each document belonging to each cluster.

    Raises
    ------
    NotFittedError
        If the model has no ``_predict_proba`` attribute, i.e. it has not
        been fitted yet.
    """
    # Guard: the prediction function only exists on the instance after fitting.
    if not hasattr(self, "_predict_proba"):
        raise NotFittedError("Model not fitted yet, can't predict probabilities.")
    documents = X
    if issparse(documents):
        notice = "Sparse arrays are not yet supported. Implicitly converting to dense array."
        warnings.warn(notice)
        documents = np.asarray(documents.todense())
    # Convert whatever the stored function returns into a NumPy array.
    return np.asarray(self._predict_proba(documents))

`transform(X)`

Alias for predict_proba().

Source code in noloox/mixture/dmm.py

def transform(self, X) -> np.ndarray:
    """Alias for predict_proba().

    Passes ``X`` straight to predict_proba() and returns its result
    unchanged.
    """
    probabilities = self.predict_proba(X)
    return probabilities