{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Colab initialization\n",
    "- install the pipeline in the colab runtime\n",
    "- download files neccessary for this example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "!pip3 install -U pip > /dev/null\n",
    "!pip3 install -U bio_embeddings[all] > /dev/null"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget http://data.bioembeddings.com/public/embeddings/notebooks/pipeline_output_example/prottrans_bert_embeddings_file.h5 --output-document prottrans_bert_embeddings_file.h5\n",
    "!wget http://data.bioembeddings.com/public/embeddings/notebooks/pipeline_output_example/seqvec_embeddings_file.h5 --output-document seqvec_embeddings_file.h5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualize the RAW embeddings as images\n",
    "This can be helpful to spot any patterns, visually"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "import h5py\n",
    "import plotly\n",
    "import numpy as np\n",
    "import plotly.express as px"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "proteins = []\n",
    "with h5py.File('prottrans_bert_embeddings_file.h5', 'r') as f:\n",
    "    for protein_id in f.keys():\n",
    "        proteins.append((protein_id, np.array(f[protein_id])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "os.mkdir(\"figures\")\n",
    "for (identifier, embedding) in proteins:\n",
    "    fig = px.imshow(np.rot90(embedding), color_continuous_scale='RdBu', zmin=-.7, zmax=.7)\n",
    "    plotly.offline.plot(fig, filename=\"figures/\"+identifier+\"_prottrans_bert.html\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For the next block of code to work, you should generate some seqvec embeddings and store them as `seqvec_embeddings_file.h5` in the `pipeline_output_example` folder.\n",
    "\n",
    "Note that we are picking the second embedding layer from the seqvec embeddings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "proteins = []\n",
    "with h5py.File('seqvec_embeddings_file.h5', 'r') as f:\n",
    "    for protein_id in f.keys():\n",
    "        proteins.append((protein_id, np.array(f[protein_id])[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for (identifier, embedding) in proteins:\n",
    "    fig = px.imshow(np.rot90(embedding), color_continuous_scale='RdBu', zmin=-1.2, zmax=1.2)\n",
    "    plotly.offline.plot(fig, filename=\"figures/\"+identifier+\"_seqvec.html\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}