Colab initialization

Install the bio_embeddings pipeline in the Colab runtime.

# Upgrade pip itself first; stdout is discarded to keep the cell output quiet.
!pip3 install -U pip > /dev/null
# Install bio-embeddings with all optional extras directly from the GitHub repository.
!pip3 install -U "bio-embeddings[all] @ git+https://github.com/sacdallago/bio_embeddings.git" > /dev/null

# Download the example antibody spreadsheet that is read later in this notebook.
!wget http://data.bioembeddings.com/public/embeddings/notebooks/custom_data/antibodies_dummy.xlsx --output-document antibodies_dummy.xlsx

Visualize sequence space for custom embeddings

In this notebook we dig deeper into an analytical application of protein LM embeddings. For this example, a private set of antibodies was used. This notebook is for illustrative purposes only.

import numpy as np
from IPython.core.display import display
from bio_embeddings.embed import SeqVecEmbedder
from bio_embeddings.project import tsne_reduce
from bio_embeddings.visualize import render_3D_scatter_plotly
from pandas import read_excel, DataFrame

%matplotlib inline

# Instantiate the SeqVec embedder that is used for every sequence below.

embedder = SeqVecEmbedder()

# Load the antibody spreadsheet (its first column becomes the row index)
# and preview the first two rows.

data = read_excel('antibodies_dummy.xlsx', index_col=0)
first_rows = data[:2]
display(first_rows)

# Generate one embedding per heavy and per light CDR3 AA sequence.
# `embed_many` yields an embedding for each sequence in the column, and
# `reduce_per_protein` is applied to each of them before it is stored.

heavy_embeddings = [
    embedder.reduce_per_protein(embedding)
    for embedding in embedder.embed_many(data['HEAVY CDR3 (aa)'])
]
light_embeddings = [
    embedder.reduce_per_protein(embedding)
    for embedding in embedder.embed_many(data['LIGHT CDR3 (aa)'])
]

# If merge is set to True, embeddings will be summed and the resulting vector will be of size 1024
# If merge is set to False, embeddings will be concatenated, and the resulting vector will be of size 2048
merge = False

# Pair the heavy and light chain of each antibody (both lists follow the row
# order of `data`) and combine them into a single vector per antibody.
_data = [
    (heavy_part + light_part) if merge else np.concatenate([heavy_part, light_part])
    for heavy_part, light_part in zip(heavy_embeddings, light_embeddings)
]

# Project the combined antibody vectors down to three t-SNE components.
transformed_embeddings = tsne_reduce(_data)

# Build the plotting table: one row per antibody, indexed like `data`,
# with the 'Poly' column attached as the label for each point.
component_columns = ["component_0", "component_1", "component_2"]
embeddings_dataframe = DataFrame(transformed_embeddings, columns=component_columns)
embeddings_dataframe.index = data.index
# `.values` drops the pandas index, so labels are assigned by row position.
embeddings_dataframe['label'] = data['Poly'].values

# Render the interactive 3D scatter plot.
fig = render_3D_scatter_plotly(embeddings_dataframe=embeddings_dataframe)
fig.show()