Colab initialization

Install the pipeline in the Colab runtime.
Download the files necessary for this example.

# Upgrade pip, then install bio_embeddings together with all of its optional extras.
# Standard output is redirected to /dev/null to keep the cell output short.
!pip3 install -U pip > /dev/null
!pip3 install -U bio_embeddings[all] > /dev/null

# Download the reference embeddings (HDF5) and the annotations table (CSV)
# into the working directory; the later cells open them by these local file names.
!wget http://data.bioembeddings.com/public/embeddings/reference/goa/protbert_reference_embeddings.h5 --output-document protbert_reference_embeddings.h5
!wget http://data.bioembeddings.com/public/embeddings/reference/goa/annotations.csv --output-document annotations.csv

Embed a sequence and find closest hit in an annotated source

Using the annotated source from goPredSim, we will transfer GO annotations to a user-supplied sequence.

Some initial steps are explained in greater detail in the pairwise_distances_and_nearest_neighbours notebook.

from bio_embeddings.embed import ProtTransBertBFDEmbedder
from bio_embeddings.extract import pairwise_distance_matrix_from_embeddings_and_annotations, get_k_nearest_neighbours

# Create the embedder that turns an amino-acid sequence into an embedding.
embedder = ProtTransBertBFDEmbedder()

# Query protein whose GO annotations we want to infer.
sequence = "MALLHSARVLSGVASAFHPGLAAAASARASSWWAHVEMGPPDPILGVTEAYKRDTNSKKMNLGVGAYRDDNGKPYVLPSVRKAEAQIAAKGLDKEYLPIGGLAEFCRASAELALGENSEVVKSGRFVTVQTISGTGALRIGASFLQRFFKFSRDVFLPKPSWGNHTPIFRDAGMQLQSYRYYDPKTCGFDFTGALEDISKIPEQSVLLLHACAHNPTGVDPRPEQWKEIATVVKKRNLFAFFDMAYQGFASGDGDKDAWAVRHFIEQGINVCLCQSYAKNMGLYGERVGAFTVICKDADEAKRVESQLKILIRPMYSNPPIHGARIASTILTSPDLRKQWLQEVKGMADRIIGMRTQLVSNLKKEGSTHSWQHITDQIGMFCFTGLKPEQVERLTKEFSIYMTKDGRISVAGVTSGNVGYLAHAIHQVTK"

# Embed the sequence first, then reduce that embedding to a single
# per-protein representation.
sequence_embedding = embedder.embed(sequence)
reduced_embedding = embedder.reduce_per_protein(sequence_embedding)

import h5py

# Store the query embedding in its own HDF5 file under the key "my_sequence",
# so it can be passed by file name to the distance computation.
with h5py.File("embeddings.h5", mode="w") as query_store:
    query_store.create_dataset(name="my_sequence", data=reduced_embedding)

# Metric used for the distance computation; it is reused further down to pick
# the reliability-index formula and to label the output.
metric = "euclidean"

# File holding the query embedding and file holding the reference embeddings.
query_embeddings_path = 'embeddings.h5'
reference_embeddings_path = 'protbert_reference_embeddings.h5'

pairwise_distances = pairwise_distance_matrix_from_embeddings_and_annotations(
    query_embeddings_path,
    reference_embeddings_path,
    metric=metric,
)

# Number of nearest reference proteins to retrieve.
k = 2

# Look up the k closest references in the distance matrix.
k_nn_indices, k_nn_distances = get_k_nearest_neighbours(pairwise_distances.pairwise_matrix, k)

# Translate the indices in row 0 into the identifiers of the reference proteins.
# NOTE(review): row 0 presumably corresponds to the single query sequence - confirm.
k_nn_identifiers = [
    pairwise_distances.references[reference_index]
    for reference_index in k_nn_indices[0]
]

# GoPredSim maps each distance onto a reliability index (RI).
# The mapping has only been asserted for the 'euclidean' and 'cosine' metrics;
# for any other metric every neighbour gets an RI of -inf.
import numpy as np

nearest_distances = k_nn_distances[0]

if metric == 'cosine':
    k_nn_RI = [1 - distance for distance in nearest_distances]
elif metric == 'euclidean':
    k_nn_RI = [0.5 / (0.5 + distance) for distance in nearest_distances]
else:
    k_nn_RI = [-np.inf for _ in nearest_distances]

from pandas import DataFrame, read_csv

# Annotation table for the reference proteins.
reference_annotations = read_csv("annotations.csv")

# One row per nearest neighbour, indexed by its identifier, carrying the
# distance (column named after the metric) and the reliability index.
k_nns = DataFrame(
    {metric: k_nn_distances[0], "RI": k_nn_RI},
    index=k_nn_identifiers,
)

# Attach distance and RI to every annotation row via its "identifier" column;
# rows left with missing values after the join are dropped.
annotated_hits = reference_annotations.join(k_nns, on="identifier").dropna()

# Group the remaining annotations per neighbour and order the groups by the
# metric value (second element of the group key), smallest first.
k_nn_groups = sorted(
    annotated_hits.groupby(["identifier", metric, "RI"]),
    key=lambda keyed_group: keyed_group[0][1],
)

# Explain how to read the reported values before listing the hits.
print(
    f"Metric used: {metric}.",
    "If you use a distance metric, the smaller the value, the more similar the embeddings.",
    "If you use a similarity metric, the smaller the value, the less similar the embeddings.",
    "\n\n",
    sep="\n",
)

# One report per nearest neighbour, in the order stored in k_nn_groups.
for (hit_id, hit_distance, hit_ri), hit_annotations in k_nn_groups:
    rounded_distance = round(hit_distance, 3)
    rounded_ri = round(hit_ri, 2)

    print(f"{hit_id}")
    print(f"  {metric}: {rounded_distance}")
    print(f"  RI: {rounded_ri}")
    print("The following GO annotations can be transferred from this protein:")
    # List every distinct label once, with a link to its AmiGO term page.
    for go_term in hit_annotations["label"].unique():
        print(f"  - {go_term}: http://amigo.geneontology.org/amigo/term/{go_term}")
    print("-----------\n")