{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Colab initialization\n",
    "- install the pipeline in the colab runtime\n",
    "- download files necessary for this example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip installs into the environment of the running kernel; -q keeps the output short\n",
    "# while still showing errors.\n",
    "%pip install -q -U pip\n",
    "%pip install -q -U \"bio_embeddings[all]\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the two example embedding files into the current working directory.\n",
    "!wget http://data.bioembeddings.com/public/embeddings/notebooks/pipeline_output_example/prottrans_bert_embeddings_file.h5 --output-document prottrans_bert_embeddings_file.h5\n",
    "!wget http://data.bioembeddings.com/public/embeddings/notebooks/pipeline_output_example/seqvec_embeddings_file.h5 --output-document seqvec_embeddings_file.h5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualize the RAW embeddings as images\n",
    "This can be helpful for spotting patterns visually."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import h5py\n",
    "import numpy as np\n",
    "import plotly\n",
    "import plotly.express as px"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FIGURES_DIR = \"figures\"  # every heatmap is written below this directory\n",
    "\n",
    "\n",
    "def load_embeddings(path, layer=None):\n",
    "    \"\"\"Load every top-level entry of an embeddings HDF5 file into memory.\n",
    "\n",
    "    Args:\n",
    "        path: Path of the .h5 file to read.\n",
    "        layer: Optional index applied to the first axis of each embedding,\n",
    "            used to pick a single layer out of a multi-layer embedding.\n",
    "            When None, the embedding is returned as stored.\n",
    "\n",
    "    Returns:\n",
    "        A list of (protein_id, embedding) tuples, one per key in the file.\n",
    "    \"\"\"\n",
    "    with h5py.File(path, \"r\") as f:\n",
    "        # Copy into numpy arrays while the file is still open.\n",
    "        embeddings = [(protein_id, np.array(f[protein_id])) for protein_id in f.keys()]\n",
    "    if layer is None:\n",
    "        return embeddings\n",
    "    return [(protein_id, embedding[layer]) for protein_id, embedding in embeddings]\n",
    "\n",
    "\n",
    "def save_embedding_heatmaps(proteins, suffix, zlim):\n",
    "    \"\"\"Render each embedding as a heatmap and save it as an HTML file.\n",
    "\n",
    "    Files are written to FIGURES_DIR as <identifier><suffix>.html.\n",
    "\n",
    "    Args:\n",
    "        proteins: Iterable of (identifier, embedding) tuples.\n",
    "        suffix: Text appended to the identifier to form the file name.\n",
    "        zlim: Colour scale limit; the scale spans -zlim to +zlim.\n",
    "    \"\"\"\n",
    "    # exist_ok=True keeps the cell re-runnable: a plain os.mkdir raises\n",
    "    # FileExistsError as soon as the directory is already there.\n",
    "    os.makedirs(FIGURES_DIR, exist_ok=True)\n",
    "    for identifier, embedding in proteins:\n",
    "        fig = px.imshow(\n",
    "            np.rot90(embedding),\n",
    "            color_continuous_scale=\"RdBu\",\n",
    "            zmin=-zlim,\n",
    "            zmax=zlim,\n",
    "        )\n",
    "        plotly.offline.plot(fig, filename=os.path.join(FIGURES_DIR, f\"{identifier}{suffix}.html\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prottrans_bert_proteins = load_embeddings(\"prottrans_bert_embeddings_file.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "save_embedding_heatmaps(prottrans_bert_proteins, \"_prottrans_bert\", zlim=0.7)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The next block of code reads seqvec embeddings from `seqvec_embeddings_file.h5`, which the Colab initialization above downloads into the working directory. To visualize your own data, generate some seqvec embeddings with the pipeline and store them under that file name.\n",
    "\n",
    "Note that we are picking the second embedding layer from the seqvec embeddings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# layer=1 selects the second embedding layer of each seqvec embedding.\n",
    "seqvec_proteins = load_embeddings(\"seqvec_embeddings_file.h5\", layer=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_embedding_heatmaps(seqvec_proteins, \"_seqvec\", zlim=1.2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}