{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "is_executing": false
    }
   },
   "source": [
    "# Colab initialization\n",
    "- install the pipeline in the Colab runtime\n",
    "- download the files necessary for this example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install into the running kernel's environment (%pip instead of !pip3), and quote the\n",
    "# extras specifier so the shell cannot interpret the square brackets as a glob pattern.\n",
    "# -q keeps the output quiet while still surfacing errors.\n",
    "%pip install -q -U pip\n",
    "%pip install -q -U \"bio_embeddings[all]\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the reference embeddings and the matching annotations table into the working directory.\n",
    "!wget -O protbert_reference_embeddings.h5 http://data.bioembeddings.com/public/embeddings/reference/goa/protbert_reference_embeddings.h5\n",
    "!wget -O annotations.csv http://data.bioembeddings.com/public/embeddings/reference/goa/annotations.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Embed a sequence and find the closest hits in an annotated source\n",
    "\n",
    "Using the annotated source from [goPredSim](https://github.com/Rostlab/goPredSim/), we will transfer GO annotations to a user-supplied sequence.\n",
    "\n",
    "\n",
    "Some initial steps are explained in greater detail in the `pairwise_distances_and_nearest_neighbours` notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bio_embeddings.embed import ProtTransBertBFDEmbedder\n",
    "from bio_embeddings.extract import pairwise_distance_matrix_from_embeddings_and_annotations, get_k_nearest_neighbours"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the embedder\n",
    "# NOTE(review): constructing the embedder presumably loads (and on first use downloads)\n",
    "# the model weights, so this cell may take a while - confirm against the bio_embeddings docs.\n",
    "embedder = ProtTransBertBFDEmbedder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Query protein sequence that should receive annotations.\n",
    "sequence = \"MALLHSARVLSGVASAFHPGLAAAASARASSWWAHVEMGPPDPILGVTEAYKRDTNSKKMNLGVGAYRDDNGKPYVLPSVRKAEAQIAAKGLDKEYLPIGGLAEFCRASAELALGENSEVVKSGRFVTVQTISGTGALRIGASFLQRFFKFSRDVFLPKPSWGNHTPIFRDAGMQLQSYRYYDPKTCGFDFTGALEDISKIPEQSVLLLHACAHNPTGVDPRPEQWKEIATVVKKRNLFAFFDMAYQGFASGDGDKDAWAVRHFIEQGINVCLCQSYAKNMGLYGERVGAFTVICKDADEAKRVESQLKILIRPMYSNPPIHGARIASTILTSPDLRKQWLQEVKGMADRIIGMRTQLVSNLKKEGSTHSWQHITDQIGMFCFTGLKPEQVERLTKEFSIYMTKDGRISVAGVTSGNVGYLAHAIHQVTK\"\n",
    "\n",
    "# Embed the sequence first, then reduce that embedding to its per-protein form.\n",
    "sequence_embedding = embedder.embed(sequence)\n",
    "reduced_embedding = embedder.reduce_per_protein(sequence_embedding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import h5py\n",
    "\n",
    "# Store the reduced embedding as dataset \"my_sequence\" in embeddings.h5.\n",
    "# Mode \"w\" creates the file, replacing any existing file of that name.\n",
    "with h5py.File(\"embeddings.h5\", mode=\"w\") as query_file:\n",
    "    query_file.create_dataset(\"my_sequence\", data=reduced_embedding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metric for comparing embeddings; later cells reuse this value as a column name\n",
    "# and to select the reliability-index formula.\n",
    "metric = \"euclidean\"\n",
    "\n",
    "# Compare the query embedding file against the downloaded reference embeddings.\n",
    "query_embeddings_path = \"embeddings.h5\"\n",
    "reference_embeddings_path = \"protbert_reference_embeddings.h5\"\n",
    "\n",
    "pairwise_distances = pairwise_distance_matrix_from_embeddings_and_annotations(\n",
    "    query_embeddings_path, reference_embeddings_path, metric=metric\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Retrieve indices of and distances to the k nearest neighbours.\n",
    "k = 2\n",
    "k_nn_indices, k_nn_distances = get_k_nearest_neighbours(pairwise_distances.pairwise_matrix, k)\n",
    "\n",
    "# Map the neighbour indices of the first (here: only) query to their reference identifiers.\n",
    "k_nn_identifiers = [pairwise_distances.references[index] for index in k_nn_indices[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# goPredSim converts distances/similarities into a reliability index (RI).\n",
    "# Note: the conversion below was only validated for metric='euclidean' or 'cosine'.\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "# Distances of the first query to its nearest neighbours.\n",
    "query_distances = k_nn_distances[0]\n",
    "\n",
    "if metric == \"euclidean\":\n",
    "    k_nn_RI = [0.5 / (0.5 + distance) for distance in query_distances]\n",
    "elif metric == \"cosine\":\n",
    "    k_nn_RI = [1 - distance for distance in query_distances]\n",
    "else:\n",
    "    # No RI conversion is available for other metrics; use -inf as a placeholder.\n",
    "    k_nn_RI = [-np.inf] * len(query_distances)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "from pandas import DataFrame, read_csv\n",
    "\n",
    "\n",
    "def _metric_value(keyed_group):\n",
    "    \"\"\"Return the metric value from a ((identifier, metric value, RI), frame) pair.\"\"\"\n",
    "    group_key, _ = keyed_group\n",
    "    return group_key[1]\n",
    "\n",
    "\n",
    "reference_annotations = read_csv(\"annotations.csv\")\n",
    "\n",
    "# One row per nearest neighbour, indexed by its reference identifier.\n",
    "k_nns = DataFrame({metric: k_nn_distances[0], \"RI\": k_nn_RI}, index=k_nn_identifiers)\n",
    "\n",
    "# Attach the neighbour values to the annotation rows; rows containing missing values are dropped.\n",
    "annotated_neighbours = reference_annotations.join(k_nns, on=\"identifier\").dropna()\n",
    "\n",
    "# Group the annotations per neighbour and order the groups by ascending metric value.\n",
    "k_nn_groups = annotated_neighbours.groupby([\"identifier\", metric, \"RI\"])\n",
    "k_nn_groups = sorted(k_nn_groups, key=_metric_value)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Short legend explaining how to read the metric values reported below.\n",
    "print(f\"Metric used: {metric}.\")\n",
    "print(\"If you use a distance metric, the smaller the value, the more similar the embeddings.\")\n",
    "print(\"If you use a similarity metric, the smaller the value, the less similar the embeddings.\")\n",
    "print(\"\\n\\n\")\n",
    "\n",
    "\n",
    "# One report per neighbour; each group key is (identifier, metric value, RI).\n",
    "for group_key, group in k_nn_groups:\n",
    "    protein, distance, RI = group_key\n",
    "    transferable_labels = group[\"label\"].unique()\n",
    "\n",
    "    print(f\"{protein}\")\n",
    "    print(f\"  {metric}: {round(distance, 3)}\")\n",
    "    print(f\"  RI: {round(RI, 2)}\")\n",
    "    print(\"The following GO annotations can be transferred from this protein:\")\n",
    "    for label in transferable_labels:\n",
    "        print(f\"  - {label}: http://amigo.geneontology.org/amigo/term/{label}\")\n",
    "    print(\"-----------\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}