Source code for bio_embeddings.embed.plus_rnn_embedder

from typing import Union, List, Generator

import torch
from numpy import ndarray
from plus.config import ModelConfig, RunConfig
from plus.data.alphabets import Protein
from plus.data.dataset import Embedding_dataset, collate_sequences_for_embedding
from plus.model.plus_rnn import PLUS_RNN, get_embedding
from plus.train import Trainer
from plus.utils import set_seeds
from torch.utils.data import DataLoader

from bio_embeddings.embed import EmbedderInterface


class PLUSRNNEmbedder(EmbedderInterface):
    """PLUS RNN Embedder

    Pre-Training of Deep Bidirectional Protein Sequence Representations with Structural Information
    Seonwoo Min, Seunghyun Park, Siwon Kim, Hyun-Soo Choi, Sungroh Yoon
    https://arxiv.org/abs/1912.05625
    """

    name = "plus_rnn"
    number_of_layers = 1
    embedding_dimension = 1024
    necessary_files = ["model_file"]

    _alphabet: Protein
    _model: PLUS_RNN
    _model_cfg: ModelConfig
    _run_cfg: RunConfig
    def __init__(self, device: Union[None, str, torch.device] = None, **kwargs):
        super().__init__(device, **kwargs)

        # This seed is copied from PLUS
        set_seeds(2020)

        # We inlined the config json files since they aren't shipped with the package
        self._alphabet = Protein()
        self._model_cfg = ModelConfig(input_dim=len(self._alphabet))
        self._model_cfg.model_type = "RNN"
        self._model_cfg.rnn_type = "B"
        self._model_cfg.num_layers = 3
        self._model_cfg.hidden_dim = 512
        self._model_cfg.embedding_dim = 100
        self._run_cfg = RunConfig(sanity_check=True)
        self._run_cfg.batch_size_eval = 512
        self._model = PLUS_RNN(self._model_cfg)
        self._model.load_weights(self._options["model_file"])
        self._model = self._model.to(self._device)
    def embed_batch(self, batch: List[str]) -> Generator[ndarray, None, None]:
        # Encode the amino acid sequences with the PLUS protein alphabet
        sequences = [
            self._alphabet.encode(sequence.encode().upper()) for sequence in batch
        ]
        test_dataset = [torch.from_numpy(sequence).long() for sequence in sequences]

        test_dataset = Embedding_dataset(
            test_dataset, self._alphabet, self._run_cfg, True
        )
        iterator_test = DataLoader(
            test_dataset,
            self._run_cfg.batch_size_eval,
            collate_fn=collate_sequences_for_embedding,
        )

        # Reuse PLUS' Trainer in evaluation mode to collect the embeddings
        model_list = [self._model, "", True, False, False]
        tasks_list = [["", [], []]]  # list of lists [idx, metrics_train, metrics_eval]
        trainer = Trainer([model_list], get_embedding, self._run_cfg, tasks_list)

        for tokens, lengths in iterator_test:
            # https://github.com/pytorch/pytorch/issues/43227
            batch = (tokens.to(self._device), lengths)
            trainer.embed(batch, {"data_parallel": False})

        embeddings = trainer.tasks_dict["results_eval"][0]["embeddings"]
        # 1 is d_h with 1024 dimensions
        for i in range(len(embeddings[0])):
            yield embeddings[1][i].numpy()

        trainer.reset()
    def embed(self, sequence: str) -> ndarray:
        [embedding] = self.embed_batch([sequence])
        return embedding
    @staticmethod
    def reduce_per_protein(embedding: ndarray) -> ndarray:
        return embedding.mean(axis=0)
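
For reference, a minimal usage sketch (not part of the module source; it
assumes bio_embeddings fetches the PLUS-RNN weights automatically on first
use when no model_file path is supplied):

from bio_embeddings.embed import PLUSRNNEmbedder

# Assumption: omitting model_file triggers an automatic download of the
# weights; otherwise pass model_file="/path/to/weights" explicitly.
embedder = PLUSRNNEmbedder()

# Per-residue embedding: one 1024-dimensional vector per amino acid
per_residue = embedder.embed("SEQWENCE")
print(per_residue.shape)  # (8, 1024)

# Per-protein embedding: mean over the residue axis
per_protein = PLUSRNNEmbedder.reduce_per_protein(per_residue)
print(per_protein.shape)  # (1024,)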