
Multimodal Topic Modelling (BETA)

Note

Multimodal modelling is still a BETA feature in Turftopic; it is likely that we will add more features and change the interface in the near future.

Some corpora span multiple modalities; a good example would be news articles with images attached. Turftopic now supports multimodal modelling with a number of its models.

Multimodal Encoders

In order for images to be usable in Turftopic, you will need an embedding model that can encode both texts and images. You can use either models supported in SentenceTransformers or models that implement the MTEB multimodal encoder interface.

Use a multimodal encoder model

from turftopic import KeyNMF

multimodal_keynmf = KeyNMF(10, encoder="clip-ViT-B-32")

Tip

You can find current state-of-the-art embedding models and their capabilities on the Massive Image Embedding Benchmark leaderboard.

Use an MTEB-compatible encoder model

pip install "mteb<2.0.0"

from turftopic import KeyNMF
import mteb

encoder = mteb.get_model("kakaobrain/align-base")

multimodal_keynmf = KeyNMF(10, encoder=encoder)

Corpus Structure

Currently, every document must have exactly one image attached to it. This is a limitation that we will address in the future. Images can be represented either as file paths or as PIL.Image objects.

from PIL import Image

images: list[Image.Image] = [Image.open("file_path/something.jpeg"), ...]
texts: list[str] = [...]

# Every document must be paired with exactly one image
assert len(images) == len(texts)
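
Since images can also be given as file paths, you can let Turftopic load them for you. A minimal sketch (the file names below are placeholders):

from turftopic import KeyNMF

# Paths are loaded into PIL images internally; the file names here are placeholders
image_paths: list[str] = ["images/article_0.jpeg", "images/article_1.jpeg"]
texts: list[str] = ["First article text...", "Second article text..."]

model = KeyNMF(10, encoder="clip-ViT-B-32")
model.fit_multimodal(texts, images=image_paths)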

Basic Usage

All multimodal models have fit_multimodal() and fit_transform_multimodal() methods, which you can use to discover topics in multimodal corpora.

Fit a multimodal model on a corpus

KeyNMF:

from turftopic import KeyNMF

model = KeyNMF(12, encoder="clip-ViT-B-32")
model.fit_multimodal(texts, images=images)
model.plot_topics_with_images()

SemanticSignalSeparation:

from turftopic import SemanticSignalSeparation

model = SemanticSignalSeparation(12, encoder="clip-ViT-B-32")
model.fit_multimodal(texts, images=images)
model.plot_topics_with_images()

ClusteringTopicModel:

from turftopic import ClusteringTopicModel

# BERTopic-style
model = ClusteringTopicModel(encoder="clip-ViT-B-32", feature_importance="c-tf-idf")
# Top2Vec-style
model = ClusteringTopicModel(encoder="clip-ViT-B-32", feature_importance="centroid")
model.fit_multimodal(texts, images=images)
model.plot_topics_with_images()

GMM:

from turftopic import GMM

model = GMM(12, encoder="clip-ViT-B-32")
model.fit_multimodal(texts, images=images)
model.plot_topics_with_images()

AutoEncodingTopicModel:

from turftopic import AutoEncodingTopicModel

# CombinedTM
model = AutoEncodingTopicModel(12, combined=True, encoder="clip-ViT-B-32")
# ZeroShotTM
model = AutoEncodingTopicModel(12, combined=False, encoder="clip-ViT-B-32")
model.fit_multimodal(texts, images=images)
model.plot_topics_with_images()
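
If you also need the document-topic matrix, use fit_transform_multimodal(). A minimal sketch:

import numpy as np
from turftopic import KeyNMF

model = KeyNMF(12, encoder="clip-ViT-B-32")
# Returns an array of shape (n_documents, n_topics)
document_topic_matrix = model.fit_transform_multimodal(texts, images=images)
# Most important topic for each document
dominant_topics = np.argmax(document_topic_matrix, axis=1)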

API reference

turftopic.multimodal.MultimodalModel

Base model for multimodal topic models.

Source code in turftopic/multimodal.py
class MultimodalModel:
    """Base model for multimodal topic models."""

    def encode_multimodal(
        self,
        sentences: list[str],
        images: list[ImageRepr],
    ) -> dict[str, np.ndarray]:
        """Produce multimodal embeddings of the documents passed to the model.

        Parameters
        ----------
        sentences: list[str]
            Textual documents to encode.
        images: list[ImageRepr]
            Corresponding images for each document.

        Returns
        -------
        MultimodalEmbeddings
            Text, image and joint document embeddings.

        """
        if len(sentences) != len(images):
            raise ValueError("Images and documents were not the same length.")
        if hasattr(self.encoder_, "get_text_embeddings"):
            text_embeddings = np.array(
                self.encoder_.get_text_embeddings(sentences)
            )
        else:
            text_embeddings = self.encoder_.encode(sentences)
        embedding_size = text_embeddings.shape[1]
        images = _load_images(images)
        if hasattr(self.encoder_, "get_image_embeddings"):
            image_embeddings = np.array(
                self.encoder_.get_image_embeddings(list(images))
            )
        else:
            image_embeddings = []
            for image in images:
                if image is not None:
                    image_embeddings.append(self.encoder_.encode(image))
                else:
                    image_embeddings.append(np.full(embedding_size, np.nan))
            image_embeddings = np.stack(image_embeddings)
            print(image_embeddings)
        if hasattr(self.encoder_, "get_fused_embeddings"):
            document_embeddings = np.array(
                self.encoder_.get_fused_embeddings(
                    texts=sentences,
                    images=list(images),
                )
            )
        else:
            document_embeddings = _naive_join_embeddings(
                text_embeddings, image_embeddings
            )

        return {
            "text_embeddings": text_embeddings,
            "image_embeddings": image_embeddings,
            "document_embeddings": document_embeddings,
        }

    @staticmethod
    def validate_embeddings(embeddings: Optional[MultimodalEmbeddings]):
        if embeddings is None:
            return
        try:
            document_embeddings = embeddings["document_embeddings"]
            image_embeddings = embeddings["image_embeddings"]
        except KeyError as e:
            raise TypeError(
                "embeddings do not contain document and image embeddings, can't be used for multimodal modelling."
            ) from e
        if document_embeddings.shape != image_embeddings.shape:
            raise ValueError(
                f"Shape mismatch between document_embeddings {document_embeddings.shape} and image_embeddings {image_embeddings.shape}"
            )

    def validate_encoder(self):
        if not hasattr(self.encoder_, "encode"):
            if not all(
                (
                    hasattr(self.encoder_, "get_text_embeddings"),
                    hasattr(self.encoder_, "get_image_embeddings"),
                ),
            ):
                raise TypeError(
                    "An encoder must either have an encode() method or a get_text_embeddings and get_image_embeddings method (optionally get_fused_embeddings)"
                )

    @abstractmethod
    def fit_transform_multimodal(
        self,
        raw_documents: list[str],
        images: list[ImageRepr],
        y=None,
        embeddings: Optional[MultimodalEmbeddings] = None,
    ) -> np.ndarray:
        """Fits topic model in a multimodal context and returns the document-topic matrix.

        Parameters
        ----------
        raw_documents: iterable of str
            Documents to fit the model on.
        images: list[ImageRepr]
            Images corresponding to each document.
        y: None
            Ignored, exists for sklearn compatibility.
        embeddings: MultimodalEmbeddings
            Precomputed multimodal embeddings.

        Returns
        -------
        ndarray of shape (n_documents, n_topics)
            Document-topic matrix.
        """
        pass

    def fit_multimodal(
        self,
        raw_documents: list[str],
        images: list[ImageRepr],
        y=None,
        embeddings: Optional[MultimodalEmbeddings] = None,
    ):
        """Fits topic model on a multimodal corpus.

        Parameters
        ----------
        raw_documents: iterable of str
            Documents to fit the model on.
        images: list[ImageRepr]
            Images corresponding to each document.
        y: None
            Ignored, exists for sklearn compatibility.
        embeddings: MultimodalEmbeddings
            Precomputed multimodal embeddings.

        Returns
        -------
        Self
            The fitted topic model
        """
        self.fit_transform_multimodal(raw_documents, images, y, embeddings)
        return self

    @staticmethod
    def collect_top_images(
        images: list[Image.Image],
        image_topic_matrix: np.ndarray,
        n_images: int = 20,
        negative: bool = False,
    ) -> list[list[Image.Image]]:
        top_images: list[list[Image.Image]] = []
        for image_topic_vector in image_topic_matrix.T:
            if negative:
                image_topic_vector = -image_topic_vector
            top_im_ind = np.argsort(-image_topic_vector)[:20]
            top_im = [images[i] for i in top_im_ind]
            top_images.append(top_im)
        return top_images

    @staticmethod
    def _image_grid(
        images: list[Image.Image],
        final_size=(1200, 1200),
        grid_size: tuple[int, int] = (4, 4),
    ):
        grid_img = Image.new("RGB", final_size, (255, 255, 255))
        cell_width = final_size[0] // grid_size[0]
        cell_height = final_size[1] // grid_size[1]
        n_rows, n_cols = grid_size
        for idx, img in enumerate(images[: n_rows * n_cols]):
            img = img.resize(
                (cell_width, cell_height), resample=Image.Resampling.LANCZOS
            )
            x_offset = (idx % grid_size[0]) * cell_width
            y_offset = (idx // grid_size[1]) * cell_height
            grid_img.paste(img, (x_offset, y_offset))
        return grid_img

    def plot_topics_with_images(self, n_cols: int = 3, grid_size: int = 4):
        """Plots the most important images for each topic, along with keywords.

        Note that you will need to `pip install plotly` to use plots in Turftopic.

        Parameters
        ----------
        n_cols: int, default 3
            Number of columns you want to have in the grid of topics.
        grid_size: int, default 4
            The square root of the number of images you want to display for a given topic.
            For instance if grid_size==4, all topics will have 16 images displayed,
            since the joint image will have 4 columns and 4 rows.

        Returns
        -------
        go.Figure
            Plotly figure containing top images and keywords for topics.
        """
        if not hasattr(self, "top_images"):
            raise ValueError(
                "Model either has not been fit or was fit without images. top_images property missing."
            )
        try:
            import plotly.graph_objects as go
        except (ImportError, ModuleNotFoundError) as e:
            raise ModuleNotFoundError(
                "Please install plotly if you intend to use plots in Turftopic."
            ) from e
        fig = go.Figure()
        width, height = 1200, 1200
        scale_factor = 0.25
        w, h = width * scale_factor, height * scale_factor
        padding = 10
        n_components = self.components_.shape[0]
        n_rows = n_components // n_cols + int(bool(n_components % n_cols))
        figure_height = (h + padding) * n_rows
        figure_width = (w + padding) * n_cols
        fig = fig.add_trace(
            go.Scatter(
                x=[0, figure_width],
                y=[0, figure_height],
                mode="markers",
                marker_opacity=0,
            )
        )
        vocab = self.get_vocab()
        for i, component in enumerate(self.components_):
            col = i % n_cols
            row = i // n_cols
            top_7 = vocab[np.argsort(-component)[:7]]
            images = self.top_images[i]
            image = self._image_grid(
                images, (width, height), grid_size=(grid_size, grid_size)
            )
            x0 = (w + padding) * col
            y0 = (h + padding) * (n_rows - row)
            fig = fig.add_layout_image(
                dict(
                    x=x0,
                    sizex=w,
                    y=y0,
                    sizey=h,
                    xref="x",
                    yref="y",
                    opacity=1.0,
                    layer="below",
                    sizing="stretch",
                    source=image,
                ),
            )
            fig.add_annotation(
                x=(w + padding) * col + (w / 2),
                y=(h + padding) * (n_rows - row) - (h / 2),
                text="<b> " + "<br> ".join(top_7),
                font=dict(
                    size=16,
                    family="Times New Roman",
                    color="white",
                ),
                bgcolor="rgba(0,0,0, 0.5)",
            )
        fig = fig.update_xaxes(visible=False, range=[0, figure_width])
        fig = fig.update_yaxes(
            visible=False,
            range=[0, figure_height],
            # the scaleanchor attribute ensures that the aspect ratio stays constant
            scaleanchor="x",
        )
        fig = fig.update_layout(
            width=figure_width,
            height=figure_height,
            margin={"l": 0, "r": 0, "t": 0, "b": 0},
        )
        return fig

encode_multimodal(sentences, images)

Produce multimodal embeddings of the documents passed to the model.

Parameters:

    sentences (list[str], required): Textual documents to encode.
    images (list[ImageRepr], required): Corresponding images for each document.

Returns:

    MultimodalEmbeddings: Text, image and joint document embeddings.

Source code in turftopic/multimodal.py
def encode_multimodal(
    self,
    sentences: list[str],
    images: list[ImageRepr],
) -> dict[str, np.ndarray]:
    """Produce multimodal embeddings of the documents passed to the model.

    Parameters
    ----------
    sentences: list[str]
        Textual documents to encode.
    images: list[ImageRepr]
        Corresponding images for each document.

    Returns
    -------
    MultimodalEmbeddings
        Text, image and joint document embeddings.

    """
    if len(sentences) != len(images):
        raise ValueError("Images and documents were not the same length.")
    if hasattr(self.encoder_, "get_text_embeddings"):
        text_embeddings = np.array(
            self.encoder_.get_text_embeddings(sentences)
        )
    else:
        text_embeddings = self.encoder_.encode(sentences)
    embedding_size = text_embeddings.shape[1]
    images = _load_images(images)
    if hasattr(self.encoder_, "get_image_embeddings"):
        image_embeddings = np.array(
            self.encoder_.get_image_embeddings(list(images))
        )
    else:
        image_embeddings = []
        for image in images:
            if image is not None:
                image_embeddings.append(self.encoder_.encode(image))
            else:
                image_embeddings.append(np.full(embedding_size, np.nan))
        image_embeddings = np.stack(image_embeddings)
        print(image_embeddings)
    if hasattr(self.encoder_, "get_fused_embeddings"):
        document_embeddings = np.array(
            self.encoder_.get_fused_embeddings(
                texts=sentences,
                images=list(images),
            )
        )
    else:
        document_embeddings = _naive_join_embeddings(
            text_embeddings, image_embeddings
        )

    return {
        "text_embeddings": text_embeddings,
        "image_embeddings": image_embeddings,
        "document_embeddings": document_embeddings,
    }
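
For illustration, a minimal sketch of calling encode_multimodal() directly, assuming the encoder is loaded when the model is constructed:

from turftopic import KeyNMF

model = KeyNMF(10, encoder="clip-ViT-B-32")
embeddings = model.encode_multimodal(texts, images=images)
# Text, image and joint document embeddings, one row per document
print(embeddings["text_embeddings"].shape)
print(embeddings["image_embeddings"].shape)
print(embeddings["document_embeddings"].shape)

The returned dictionary can then be passed to fit_multimodal()/fit_transform_multimodal() via the embeddings argument.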

fit_multimodal(raw_documents, images, y=None, embeddings=None)

Fits topic model on a multimodal corpus.

Parameters:

    raw_documents (list[str], required): Documents to fit the model on.
    images (list[ImageRepr], required): Images corresponding to each document.
    y (default None): Ignored, exists for sklearn compatibility.
    embeddings (Optional[MultimodalEmbeddings], default None): Precomputed multimodal embeddings.

Returns:

    Self: The fitted topic model.

Source code in turftopic/multimodal.py
def fit_multimodal(
    self,
    raw_documents: list[str],
    images: list[ImageRepr],
    y=None,
    embeddings: Optional[MultimodalEmbeddings] = None,
):
    """Fits topic model on a multimodal corpus.

    Parameters
    ----------
    raw_documents: iterable of str
        Documents to fit the model on.
    images: list[ImageRepr]
        Images corresponding to each document.
    y: None
        Ignored, exists for sklearn compatibility.
    embeddings: MultimodalEmbeddings
        Precomputed multimodal embeddings.

    Returns
    -------
    Self
        The fitted topic model
    """
    self.fit_transform_multimodal(raw_documents, images, y, embeddings)
    return self
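
A minimal sketch of fitting with precomputed embeddings, assuming they were obtained from encode_multimodal() with the same encoder:

from turftopic import KeyNMF, GMM

keynmf = KeyNMF(10, encoder="clip-ViT-B-32")
embeddings = keynmf.encode_multimodal(texts, images=images)
keynmf.fit_multimodal(texts, images=images, embeddings=embeddings)

# The same embeddings can be reused by another model that shares the encoder
gmm = GMM(10, encoder="clip-ViT-B-32")
gmm.fit_multimodal(texts, images=images, embeddings=embeddings)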

fit_transform_multimodal(raw_documents, images, y=None, embeddings=None) abstractmethod

Fits topic model in a multimodal context and returns the document-topic matrix.

Parameters:

    raw_documents (list[str], required): Documents to fit the model on.
    images (list[ImageRepr], required): Images corresponding to each document.
    y (default None): Ignored, exists for sklearn compatibility.
    embeddings (Optional[MultimodalEmbeddings], default None): Precomputed multimodal embeddings.

Returns:

    ndarray of shape (n_documents, n_topics): Document-topic matrix.

Source code in turftopic/multimodal.py
@abstractmethod
def fit_transform_multimodal(
    self,
    raw_documents: list[str],
    images: list[ImageRepr],
    y=None,
    embeddings: Optional[MultimodalEmbeddings] = None,
) -> np.ndarray:
    """Fits topic model in a multimodal context and returns the document-topic matrix.

    Parameters
    ----------
    raw_documents: iterable of str
        Documents to fit the model on.
    images: list[ImageRepr]
        Images corresponding to each document.
    y: None
        Ignored, exists for sklearn compatibility.
    embeddings: MultimodalEmbeddings
        Precomputed multimodal embeddings.

    Returns
    -------
    ndarray of shape (n_documents, n_topics)
        Document-topic matrix.
    """
    pass

plot_topics_with_images(n_cols=3, grid_size=4)

Plots the most important images for each topic, along with keywords.

Note that you will need to pip install plotly to use plots in Turftopic.

Parameters:

    n_cols (int, default 3): Number of columns you want to have in the grid of topics.
    grid_size (int, default 4): The square root of the number of images you want to display for a given topic. For instance, if grid_size==4, all topics will have 16 images displayed, since the joint image will have 4 columns and 4 rows.

Returns:

    go.Figure: Plotly figure containing top images and keywords for topics.

Source code in turftopic/multimodal.py
def plot_topics_with_images(self, n_cols: int = 3, grid_size: int = 4):
    """Plots the most important images for each topic, along with keywords.

    Note that you will need to `pip install plotly` to use plots in Turftopic.

    Parameters
    ----------
    n_cols: int, default 3
        Number of columns you want to have in the grid of topics.
    grid_size: int, default 4
        The square root of the number of images you want to display for a given topic.
        For instance if grid_size==4, all topics will have 16 images displayed,
        since the joint image will have 4 columns and 4 rows.

    Returns
    -------
    go.Figure
        Plotly figure containing top images and keywords for topics.
    """
    if not hasattr(self, "top_images"):
        raise ValueError(
            "Model either has not been fit or was fit without images. top_images property missing."
        )
    try:
        import plotly.graph_objects as go
    except (ImportError, ModuleNotFoundError) as e:
        raise ModuleNotFoundError(
            "Please install plotly if you intend to use plots in Turftopic."
        ) from e
    fig = go.Figure()
    width, height = 1200, 1200
    scale_factor = 0.25
    w, h = width * scale_factor, height * scale_factor
    padding = 10
    n_components = self.components_.shape[0]
    n_rows = n_components // n_cols + int(bool(n_components % n_cols))
    figure_height = (h + padding) * n_rows
    figure_width = (w + padding) * n_cols
    fig = fig.add_trace(
        go.Scatter(
            x=[0, figure_width],
            y=[0, figure_height],
            mode="markers",
            marker_opacity=0,
        )
    )
    vocab = self.get_vocab()
    for i, component in enumerate(self.components_):
        col = i % n_cols
        row = i // n_cols
        top_7 = vocab[np.argsort(-component)[:7]]
        images = self.top_images[i]
        image = self._image_grid(
            images, (width, height), grid_size=(grid_size, grid_size)
        )
        x0 = (w + padding) * col
        y0 = (h + padding) * (n_rows - row)
        fig = fig.add_layout_image(
            dict(
                x=x0,
                sizex=w,
                y=y0,
                sizey=h,
                xref="x",
                yref="y",
                opacity=1.0,
                layer="below",
                sizing="stretch",
                source=image,
            ),
        )
        fig.add_annotation(
            x=(w + padding) * col + (w / 2),
            y=(h + padding) * (n_rows - row) - (h / 2),
            text="<b> " + "<br> ".join(top_7),
            font=dict(
                size=16,
                family="Times New Roman",
                color="white",
            ),
            bgcolor="rgba(0,0,0, 0.5)",
        )
    fig = fig.update_xaxes(visible=False, range=[0, figure_width])
    fig = fig.update_yaxes(
        visible=False,
        range=[0, figure_height],
        # the scaleanchor attribute ensures that the aspect ratio stays constant
        scaleanchor="x",
    )
    fig = fig.update_layout(
        width=figure_width,
        height=figure_height,
        margin={"l": 0, "r": 0, "t": 0, "b": 0},
    )
    return fig
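
A typical call after a multimodal fit; the output file name below is a placeholder:

from turftopic import KeyNMF

model = KeyNMF(12, encoder="clip-ViT-B-32")
model.fit_multimodal(texts, images=images)
# Two columns of topics, 9 images (3x3) per topic
fig = model.plot_topics_with_images(n_cols=2, grid_size=3)
fig.write_html("topics_with_images.html")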

turftopic.encoders.multimodal.MultimodalEncoder

Bases: Protocol

Base class for external encoder models.

Source code in turftopic/encoders/multimodal.py
class MultimodalEncoder(Protocol):
    """Base class for external encoder models."""

    def get_text_embeddings(
        self,
        texts: list[str],
        *,
        batch_size: int = 8,
        **kwargs,
    ): ...

    def get_image_embeddings(
        self,
        images: list[Image.Image],
        *,
        batch_size: int = 8,
        **kwargs,
    ): ...

    def get_fused_embeddings(
        self,
        texts: list[str] = None,
        images: list[Image.Image] = None,
        batch_size: int = 8,
        **kwargs,
    ): ...
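
Any object implementing this protocol can be passed as the encoder. Below is a minimal sketch of a custom encoder wrapping a SentenceTransformers CLIP model; the class name and the naive fusion strategy are illustrative, not part of Turftopic:

import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer

from turftopic import KeyNMF


class ClipMultimodalEncoder:
    """Hypothetical encoder following the MultimodalEncoder protocol."""

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        self.model = SentenceTransformer(model_name)

    def get_text_embeddings(self, texts: list[str], *, batch_size: int = 8, **kwargs):
        return self.model.encode(texts, batch_size=batch_size)

    def get_image_embeddings(self, images: list[Image.Image], *, batch_size: int = 8, **kwargs):
        # CLIP models in SentenceTransformers can encode PIL images directly
        return self.model.encode(images, batch_size=batch_size)

    def get_fused_embeddings(self, texts: list[str] = None, images: list[Image.Image] = None, batch_size: int = 8, **kwargs):
        # Naive fusion: average the text and image embeddings
        text_emb = self.get_text_embeddings(texts, batch_size=batch_size)
        image_emb = self.get_image_embeddings(images, batch_size=batch_size)
        return np.mean([text_emb, image_emb], axis=0)


model = KeyNMF(10, encoder=ClipMultimodalEncoder())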