Colab initialization
Install the bio_embeddings pipeline and download the example dataset in the Colab runtime.
!pip3 install -U pip > /dev/null
!pip3 install -U bio_embeddings[all] > /dev/null
!wget http://data.bioembeddings.com/public/embeddings/notebooks/custom_data/antibodies_dummy.xlsx --output-document antibodies_dummy.xlsx
Visualize sequence space for custom embeddings
In this notebook we dig deeper into an analytical application of protein language model (LM) embeddings. For this example, a private set of antibodies was used. This notebook is for illustrative purposes only.
import numpy as np
from IPython.core.display import display
from bio_embeddings.embed import SeqVecEmbedder
from bio_embeddings.project import tsne_reduce
from bio_embeddings.visualize import render_3D_scatter_plotly
from pandas import read_excel, DataFrame
%matplotlib inline
# Load the antibody spreadsheet; index_col=0 makes its first column the row index.
data = read_excel('antibodies_dummy.xlsx', index_col=0)
# Preview the first two rows of the table.
display(data[:2])
# Instantiate the embedder that is used for every sequence further down.
embedder = SeqVecEmbedder()
# Generate embeddings for the heavy and light CDR3 amino-acid (AA) sequences.
# Each embedding produced by embed_many is passed through reduce_per_protein,
# so both lists hold one reduced embedding per row of `data`, in row order.
heavy_embeddings = [
    embedder.reduce_per_protein(embedding)
    for embedding in embedder.embed_many(data['HEAVY CDR3 (aa)'])
]
light_embeddings = [
    embedder.reduce_per_protein(embedding)
    for embedding in embedder.embed_many(data['LIGHT CDR3 (aa)'])
]
# Alias kept so that any cell still using the earlier, misspelled name keeps working.
ligth_embeddings = light_embeddings

# If merge is set to True, embeddings will be summed and the resulting vector will be of size 1024
# If merge is set to False, embeddings will be concatenated, and the resulting vector will be of size 2048
merge = False

# Combine the heavy and light vectors of each row into a single vector per antibody.
_data = [
    heavy_part + light_part if merge else np.concatenate([heavy_part, light_part])
    for heavy_part, light_part in zip(heavy_embeddings, light_embeddings)
]
# Project the combined embeddings with t-SNE; the result is read as three components below.
transformed_embeddings = tsne_reduce(_data)

# Build the plotting table: three projected components per antibody,
# indexed like the input sheet and labelled with its 'Poly' column.
embeddings_dataframe = DataFrame(
    transformed_embeddings,
    columns=["component_0", "component_1", "component_2"],
)
embeddings_dataframe.index = data.index
# .values strips the index, so the labels are assigned positionally (row by row).
embeddings_dataframe['label'] = data['Poly'].values

# Render the 3D scatter plot of the projected embeddings and show it.
fig = render_3D_scatter_plotly(embeddings_dataframe=embeddings_dataframe)
fig.show()