{ "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": true, "pycharm": { "is_executing": false } }, "source": [ "# Colab initialization\n", "- install the pipeline in the Colab runtime\n", "- download the files necessary for this example" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip3 install -U pip > /dev/null\n", "!pip3 install -U bio_embeddings[all] > /dev/null" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!wget http://data.bioembeddings.com/public/embeddings/reference/goa/protbert_reference_embeddings.h5 --output-document protbert_reference_embeddings.h5\n", "!wget http://data.bioembeddings.com/public/embeddings/reference/goa/annotations.csv --output-document annotations.csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Embed a sequence and find closest hit in an annotated source\n", "\n", "Using the annotated source from [goPredSim](https://github.com/Rostlab/goPredSim/), we will transfer GO annotations to a user-supplied sequence.\n", "\n", "\n", "Some initial steps are explained in greater detail in the `pairwise_distances_and_nearest_neighbours` notebook." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from bio_embeddings.embed import ProtTransBertBFDEmbedder\n", "from bio_embeddings.extract import pairwise_distance_matrix_from_embeddings_and_annotations, get_k_nearest_neighbours" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Initialize the embedder\n", "embedder = ProtTransBertBFDEmbedder()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sequence = \"MALLHSARVLSGVASAFHPGLAAAASARASSWWAHVEMGPPDPILGVTEAYKRDTNSKKMNLGVGAYRDDNGKPYVLPSVRKAEAQIAAKGLDKEYLPIGGLAEFCRASAELALGENSEVVKSGRFVTVQTISGTGALRIGASFLQRFFKFSRDVFLPKPSWGNHTPIFRDAGMQLQSYRYYDPKTCGFDFTGALEDISKIPEQSVLLLHACAHNPTGVDPRPEQWKEIATVVKKRNLFAFFDMAYQGFASGDGDKDAWAVRHFIEQGINVCLCQSYAKNMGLYGERVGAFTVICKDADEAKRVESQLKILIRPMYSNPPIHGARIASTILTSPDLRKQWLQEVKGMADRIIGMRTQLVSNLKKEGSTHSWQHITDQIGMFCFTGLKPEQVERLTKEFSIYMTKDGRISVAGVTSGNVGYLAHAIHQVTK\"\n", "reduced_embedding = embedder.reduce_per_protein(embedder.embed(sequence))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import h5py\n", "\n", "with h5py.File(\"embeddings.h5\", \"w\") as embeddings_file:\n", " embeddings_file.create_dataset(\"my_sequence\", data=reduced_embedding)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metric=\"euclidean\"\n", "\n", "pairwise_distances = pairwise_distance_matrix_from_embeddings_and_annotations(\n", " 'embeddings.h5',\n", " 'protbert_reference_embeddings.h5',\n", " metric=metric\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get the indices and distances to the k-nearest neighbours, then get their identifiers\n", "k = 2\n", "k_nn_indices, k_nn_distances = get_k_nearest_neighbours(pairwise_distances.pairwise_matrix, k)\n", "k_nn_identifiers = list(map(pairwise_distances.references.__getitem__, 
k_nn_indices[0]))" ] }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "# GoPredSim scales distances/similarities to a reliability index.\n", "# Note that the following was only asserted for metric='euclidean' or 'cosine'\n", "import numpy as np\n", "\n", "\n", "if metric == 'euclidean':\n", " k_nn_RI = [0.5/(0.5+dist) for dist in k_nn_distances[0]]\n", "elif metric == 'cosine':\n", " k_nn_RI = [1-dist for dist in k_nn_distances[0]]\n", "else:\n", " k_nn_RI = [-np.inf] * len(k_nn_distances[0])" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "from pandas import DataFrame, read_csv\n", "\n", "reference_annotations = read_csv(\"annotations.csv\")\n", "k_nns = DataFrame({metric: k_nn_distances[0], \"RI\": k_nn_RI}, index=k_nn_identifiers)\n", "k_nn_groups = reference_annotations.join(k_nns, on=\"identifier\").dropna().groupby([\"identifier\", metric, \"RI\"])\n", "k_nn_groups = sorted(k_nn_groups, key=lambda x: x[0][1])" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"Metric used: {metric}.\")\n", "print(\"If you use a distance metric, the smaller the value, the more similar the embeddings.\")\n", "print(\"If you use a similarity metric, the smaller the value, the less similar the embeddings.\")\n", "print(\"\\n\\n\")\n", "\n", "\n", "for (protein, distance, RI), group in k_nn_groups:\n", " print(f\"{protein}\")\n", " print(f\" {metric}: {round(distance, 3)}\")\n", " print(f\" RI: {round(RI, 2)}\")\n", " print(\"The following GO annotations can be transferred from this protein:\")\n", " for label in group.label.unique():\n", " print(f\" - {label}: http://amigo.geneontology.org/amigo/term/{label}\")\n", " print(\"-----------\\n\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", 
"name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 1 }