Source code for bio_embeddings.mutagenesis.protbert_bfd

import logging
import re
from typing import Union, Optional, List, Dict

import torch
import transformers
from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM

from bio_embeddings.embed import ProtTransBertBFDEmbedder
from bio_embeddings.mutagenesis import AMINO_ACIDS
from bio_embeddings.utilities import (
    get_device,
    get_model_directories_from_zip,
)


class FilterBertForMaskedLMWeightsWarning(logging.Filter):
    """transformers complains that we don't use some of the weights with BertForMaskedLM instead of BertModel,
    which we can ignore"""

    def filter(self, record: logging.LogRecord) -> bool:
        return (
            "were not used when initializing BertForMaskedLM: "
            "['cls.seq_relationship.weight', 'cls.seq_relationship.bias']"
            not in record.getMessage()
        )


transformers.modeling_utils.logger.addFilter(FilterBertForMaskedLMWeightsWarning())


class ProtTransBertBFDMutagenesis:
    """BETA: in-silico mutagenesis using BertForMaskedLM"""

    device: torch.device
    model: BertForMaskedLM
    tokenizer: BertTokenizer
    _half_precision_model: bool
    def __init__(
        self,
        device: Union[None, str, torch.device] = None,
        model_directory: Optional[str] = None,
        half_precision_model: bool = False,
    ):
        """Loads the Bert Model for Masked LM"""
        self.device = get_device(device)
        self._half_precision_model = half_precision_model

        if not model_directory:
            model_directory = get_model_directories_from_zip(
                model=ProtTransBertBFDEmbedder.name, directory="model_directory"
            )

        self.tokenizer = BertTokenizer.from_pretrained(
            model_directory, do_lower_case=False
        )
        self.model = BertForMaskedLM.from_pretrained(model_directory)

        # Compute in half precision, which is a lot faster and saves us half the memory
        if self._half_precision_model:
            self.model = self.model.half()

        self.model = self.model.eval().to(self.device)
    def get_sequence_probabilities(
        self,
        sequence: str,
        temperature: float = 1,
        start: Optional[int] = None,
        stop: Optional[int] = None,
        progress_bar: Optional[tqdm] = None,
    ) -> List[Dict[str, float]]:
        """Returns the likelihood of each of the 20 natural amino acids occurring at every residue
        position between `start` and `stop`, given the context of the remainder of the sequence
        (i.e. by masking each position with BERT's mask token and reconstructing the corrupted sequence).

        Probabilities may be adjusted by a `temperature` factor; if set to `1` (default), no adjustment is made.

        :param sequence: The amino acid sequence. Please pass whole sequences, not regions
        :param start: the start index (inclusive) of the region for which to compute residue probabilities (starting with 0)
        :param stop: the end (exclusive) of the region for which to compute residue probabilities
        :param temperature: temperature for the softmax computation
        :param progress_bar: optional tqdm progress bar

        :return: An ordered list, one entry per position in the region, mapping each of the 20 natural
            amino acids to its probability of occurring at that position.
        """
        # https://stackoverflow.com/questions/59435020/get-probability-of-multi-token-word-in-mask-position

        # init softmax to get probabilities later on
        sm = torch.nn.Softmax(dim=0)
        AA_tokens = [
            self.tokenizer.convert_tokens_to_ids(AA) for AA in list(AMINO_ACIDS)
        ]

        # Create L sequences, with each position masked once
        probabilities_list = list()

        # Map rare amino acids (U, Z, O, B) to X
        current_sequence = re.sub(r"[UZOB]", "X", sequence)

        # Mask each token individually
        for i in range(start or 0, stop or len(sequence)):
            masked_sequence = list(current_sequence)
            masked_sequence = (
                masked_sequence[:i]
                + [self.tokenizer.mask_token]
                + masked_sequence[i + 1 :]
            )
            # Each AA is a word, so we need spaces in between
            masked_sequence = " ".join(masked_sequence)
            tokenized_sequence = self.tokenizer.encode(
                masked_sequence, return_tensors="pt"
            )

            # get the position of the masked token
            # noinspection PyTypeChecker
            masked_position = torch.nonzero(
                tokenized_sequence.squeeze() == self.tokenizer.mask_token_id
            ).item()

            # TODO: can batch this!
            output = self.model(tokenized_sequence.to(self.device))
            last_hidden_state = output[0].squeeze(0)

            # only take the output for the masked token;
            # it has the size of the vocabulary
            mask_hidden_state = last_hidden_state[masked_position].cpu()

            # convert to probabilities (softmax),
            # giving a probability for each item in the vocabulary
            probabilities = sm(mask_hidden_state / temperature)

            # Map each amino acid to the probability of it being at the given position
            result = dict(
                zip(list(AMINO_ACIDS), [probabilities[AA].item() for AA in AA_tokens])
            )
            result["position"] = i

            # Append in order to the list of per-position probabilities
            probabilities_list.append(result)

            if progress_bar:
                progress_bar.update()

        return probabilities_list
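
For reference, a minimal usage sketch of the class above. The example sequence, the CPU device, and importing directly from this module are assumptions for illustration; the constructor and `get_sequence_probabilities` signatures are as defined above. Note that omitting `model_directory` makes the constructor fetch the ProtBert-BFD model via `get_model_directories_from_zip`.

from tqdm import tqdm

from bio_embeddings.mutagenesis.protbert_bfd import ProtTransBertBFDMutagenesis

# Hypothetical example sequence; pass the whole sequence, not a region
sequence = "MSILVTRPSPAGEEL"

# Loads (and, if necessary, downloads) the ProtBert-BFD masked LM
mutagenesis = ProtTransBertBFDMutagenesis(device="cpu", half_precision_model=False)

# One progress-bar update is made per masked position
with tqdm(total=len(sequence)) as progress_bar:
    residue_probabilities = mutagenesis.get_sequence_probabilities(
        sequence, temperature=1, progress_bar=progress_bar
    )

# Each entry maps the 20 natural amino acids to their probability at that
# residue and carries the zero-based "position" key
print(residue_probabilities[0]["position"], residue_probabilities[0]["A"])

Passing `start` and `stop` restricts the computation to a region while still conditioning on the full sequence as context.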