Colab initialization

  • install the pipeline in the colab runtime

  • download files necessary for this example

!pip3 install -U pip > /dev/null
!pip3 install -U bio_embeddings[all] > /dev/null
!wget http://data.bioembeddings.com/public/embeddings/notebooks/pipeline_output_example/mapping_file.csv --output-document mapping_file.csv
!wget http://data.bioembeddings.com/public/embeddings/notebooks/pipeline_output_example/reduced_embeddings_file.h5 --output-document reduced_embeddings_file.h5
!wget http://data.bioembeddings.com/public/embeddings/notebooks/custom_data/annotation_file.csv --output-document annotation_file.csv

Visualize sequence spaces drawn by embeddings

In this notebook, we use the output of the embed stage to draw custom t-SNE sequence space plots.

import h5py
import numpy as np
from pandas import read_csv
from bio_embeddings.project import tsne_reduce
from bio_embeddings.visualize import render_3D_scatter_plotly
# Mapping table written by the pipeline; the first CSV column becomes the index
# and supplies the keys used to look up embeddings in the HDF5 file below.
mapping_file = read_csv('mapping_file.csv', index_col=0)

# Collect one embedding per mapping row, in index order, so that position i of
# `embeddings` corresponds to row i of `mapping_file`.
with h5py.File('reduced_embeddings_file.h5', 'r') as f:
    # h5py only accepts str/bytes keys, whereas pandas parses purely numeric
    # identifiers as integers; str() keeps the lookup working in both cases.
    embeddings = [np.array(f[str(remapped_id)]) for remapped_id in mapping_file.index]
# Keyword arguments forwarded unchanged to tsne_reduce.
options = dict(perplexity=3, n_iter=500)

projected_embeddings = tsne_reduce(embeddings, **options)

# Store the first three projected coordinates next to each sequence in the
# mapping table. NOTE(review): this relies on tsne_reduce returning at least
# three columns when n_components is not passed -- confirm against its defaults.
for axis, column in enumerate(('component_0', 'component_1', 'component_2')):
    mapping_file[column] = projected_embeddings[:, axis]
# Annotation table; the first CSV column is used as the index.
# Presumably that column holds the original sequence ids -- verify against the file.
annotation_file = read_csv('annotation_file.csv', index_col=0)

# Re-key the mapping table by 'original_id' so it can be joined on the
# annotation index. The outer join keeps rows that appear in only one table.
merged_annotation_file = annotation_file.join(mapping_file.set_index('original_id'), how='outer')

# Rows without an annotation get a placeholder label. The result is assigned
# back explicitly: calling fillna(inplace=True) on a selected column is chained
# assignment, which warns on recent pandas and does not update the frame under
# Copy-on-Write.
merged_annotation_file['label'] = merged_annotation_file['label'].fillna('UNKNOWN')

figure = render_3D_scatter_plotly(merged_annotation_file)
figure.show()