Colab initialization

  • Install the bio_embeddings pipeline in the Colab runtime

!pip3 install -U pip > /dev/null
!pip3 install -U bio_embeddings[all] > /dev/null
!wget --output-document antibodies_dummy.xlsx

Visualize sequence space for custom embeddings

In this notebook, we dig deeper into an analytical application of protein language model (LM) embeddings. For this example, a private set of antibodies was used. This notebook is for illustrative purposes only.

import numpy as np
from IPython.core.display import display
from bio_embeddings.embed import SeqVecEmbedder
from bio_embeddings.project import tsne_reduce
from bio_embeddings.visualize import render_3D_scatter_plotly
from pandas import read_excel, DataFrame

%matplotlib inline
# Construct the embedder class.

embedder = SeqVecEmbedder()
# Read the Excel sheet (its first column is used as the index) and display
# the first two entries.

data = read_excel('antibodies_dummy.xlsx', index_col=0)
display(data.head(2))
# Generate embeddings for heavy and light CDR3 AA sequences.
# Each sequence embedding is reduced to a single per-protein vector
# (size 1024 according to the notes below).

heavy_embeddings = [embedder.reduce_per_protein(embedding) for embedding in embedder.embed_many(data['HEAVY CDR3 (aa)'])]
light_embeddings = [embedder.reduce_per_protein(embedding) for embedding in embedder.embed_many(data['LIGHT CDR3 (aa)'])]
# Misspelled alias kept so that any other cell still using the old name keeps working.
ligth_embeddings = light_embeddings
# If merge is set to True, embeddings will be summed and the resulting vector will be of size 1024
# If merge is set to False, embeddings will be concatenated, and the resulting vector will be of size 2048
merge = False

_data = []

for heavy_part, light_part in zip(heavy_embeddings, light_embeddings):
    if merge:
        # Element-wise sum keeps the dimensionality of a single chain embedding.
        _data.append(np.add(heavy_part, light_part))
    else:
        # Stack heavy and light vectors end to end, doubling the dimensionality.
        _data.append(np.concatenate([heavy_part, light_part]))

# Project the combined per-antibody vectors down for plotting.
transformed_embeddings = tsne_reduce(_data)
# Wrap the reduced coordinates in a DataFrame, one row per antibody.
component_columns = ["component_0", "component_1", "component_2"]
embeddings_dataframe = DataFrame(transformed_embeddings, columns=component_columns)

# Re-use the spreadsheet's index, then attach the 'Poly' column as the label.
# `.values` is positional, so the label order follows the spreadsheet rows.
embeddings_dataframe.index = data.index
embeddings_dataframe['label'] = data['Poly'].values

# Build the 3D scatter figure from the labelled coordinates.
fig = render_3D_scatter_plotly(embeddings_dataframe=embeddings_dataframe)