{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Colab initialization\n", "- install the pipeline in the Colab runtime\n", "- download files necessary for this example" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip3 install -U pip > /dev/null\n", "!pip3 install -U bio_embeddings[all] > /dev/null" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!wget http://data.bioembeddings.com/public/embeddings/notebooks/pipeline_output_example/mapping_file.csv --output-document mapping_file.csv\n", "!wget http://data.bioembeddings.com/public/embeddings/notebooks/pipeline_output_example/reduced_embeddings_file.h5 --output-document reduced_embeddings_file.h5\n", "!wget http://data.bioembeddings.com/public/embeddings/notebooks/custom_data/annotation_file.csv --output-document annotation_file.csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualize sequence spaces drawn by embeddings\n", "In this notebook, we use the output of the _embed_ stage to draw custom t-SNE sequence space plots." 
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [
  "import h5py\n",
  "import numpy as np\n",
  "from pandas import read_csv\n",
  "from bio_embeddings.project import tsne_reduce\n",
  "from bio_embeddings.visualize import render_3D_scatter_plotly"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [
  "# The first CSV column becomes the index; its values are used below as\n",
  "# the dataset names inside the embeddings file.\n",
  "mapping_file = read_csv('mapping_file.csv', index_col=0)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [
  "# Load one embedding per mapping row, in mapping order, so that row i of\n",
  "# the projection lines up with row i of mapping_file.\n",
  "with h5py.File('reduced_embeddings_file.h5', 'r') as f:\n",
  "    # h5py looks datasets up by string name; read_csv may parse an\n",
  "    # all-numeric id column as integers, so cast explicitly.\n",
  "    embeddings = [\n",
  "        np.array(f[str(remapped_id)])\n",
  "        for remapped_id in mapping_file.index\n",
  "    ]"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [
  "# Keyword arguments forwarded unchanged to tsne_reduce.\n",
  "options = {\n",
  "    'perplexity': 3,\n",
  "    'n_iter': 500\n",
  "}\n",
  "\n",
  "projected_embeddings = tsne_reduce(embeddings, **options)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [
  "# Store the first three projected dimensions next to each sequence id.\n",
  "# NOTE(review): assumes tsne_reduce returns at least three components\n",
  "# when n_components is not passed -- confirm against its documentation.\n",
  "mapping_file['component_0'] = projected_embeddings[:, 0]\n",
  "mapping_file['component_1'] = projected_embeddings[:, 1]\n",
  "mapping_file['component_2'] = projected_embeddings[:, 2]"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [
  "annotation_file = read_csv('annotation_file.csv', index_col=0)\n",
  "\n",
  "# The outer join keeps ids that appear in only one of the two files.\n",
  "merged_annotation_file = annotation_file.join(mapping_file.set_index('original_id'), how='outer')\n",
  "\n",
  "# Rows without a label are tagged 'UNKNOWN'. Assign the result back instead\n",
  "# of calling fillna(inplace=True) on the column selection: that chained\n",
  "# in-place form is deprecated in pandas 2.x and has no effect once\n",
  "# Copy-on-Write is enabled.\n",
  "merged_annotation_file['label'] = merged_annotation_file['label'].fillna('UNKNOWN')"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [
  "figure = render_3D_scatter_plotly(merged_annotation_file)\n",
  "figure.show()"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 2 }