{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Colab initialization\n", "- install the pipeline in the colab runtime\n", "- download files neccessary for this example" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip3 install -U pip > /dev/null\n", "!pip3 install -U bio_embeddings[all] > /dev/null" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!wget http://data.bioembeddings.com/public/embeddings/notebooks/custom_data/tiny_sampled.fasta --output-document tiny_sampled.fasta" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Embed sequences in a FASTA file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from bio_embeddings.embed import ProtTransBertBFDEmbedder\n", "from Bio import SeqIO" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sequences = []\n", "for record in SeqIO.parse(\"tiny_sampled.fasta\", \"fasta\"):\n", " sequences.append(record)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "embedder = ProtTransBertBFDEmbedder()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "embeddings = embedder.embed_many([str(s.seq) for s in sequences])\n", "\n", "# `embed_many` returns a generator.\n", "# We want to keep both RAW embeddings and reduced embeddings in memory.\n", "# To do so, we simply turn the generator into a list!\n", "# (this will start embedding the sequences!)\n", "\n", "embeddings = list(embeddings)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "reduced_embeddings = [ProtTransBertBFDEmbedder.reduce_per_protein(e) for e in embeddings]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for (per_amino_acid, per_protein) in zip(embeddings, reduced_embeddings):\n", " print(per_amino_acid.shape, per_protein.shape)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" }, "pycharm": { "stem_cell": { "cell_type": "raw", "source": [], "metadata": { "collapsed": false } } } }, "nbformat": 4, "nbformat_minor": 2 }