# Source code for bio_embeddings.extract.basic.basic_annotation_extractor

import collections
import logging
from typing import List, Union, Dict, Any

import numpy
import torch
from numpy import ndarray

from bio_embeddings.extract.annotations import Location, Membrane, Disorder, SecondaryStructure
from bio_embeddings.extract.basic.annotation_inference_models import SubCellFNN, SecStructCNN
from bio_embeddings.utilities import get_device, get_model_file

logger = logging.getLogger(__name__)

# Label mappings.
# Each table maps a class index to its annotation; the position of a label in
# the sequence below is the index it is stored under (0, 1, 2, ...).
_location_labels = dict(enumerate((
    Location.CELL_MEMBRANE,
    Location.CYTOPLASM,
    Location.ENDOPLASMATIC_RETICULUM,
    Location.GOLGI_APPARATUS,
    Location.LYSOSOME_OR_VACUOLE,
    Location.MITOCHONDRION,
    Location.NUCLEUS,
    Location.PEROXISOME,
    Location.PLASTID,
    Location.EXTRACELLULAR,
)))

_membrane_labels = dict(enumerate((
    Membrane.SOLUBLE,
    Membrane.MEMBRANE,
)))

# 8-state secondary structure
_dssp8_labels = dict(enumerate((
    SecondaryStructure.THREE_HELIX,
    SecondaryStructure.ALPHA_HELIX,
    SecondaryStructure.FIVE_HELIX,
    SecondaryStructure.ISOLATED_BETA_BRIDGE,
    SecondaryStructure.EXTENDED_STRAND,
    SecondaryStructure.BEND,
    SecondaryStructure.TURN,
    SecondaryStructure.IRREGULAR,
)))

# 3-state secondary structure
_dssp3_labels = dict(enumerate((
    SecondaryStructure.ALPHA_HELIX,
    SecondaryStructure.EXTENDED_STRAND,
    SecondaryStructure.IRREGULAR,
)))

_disorder_labels = dict(enumerate((
    Disorder.ORDER,
    Disorder.DISORDER,
)))

# Result containers returned by the annotation extractor.

# Per-residue predictions (labels) plus the matching raw class probabilities.
BasicSecondaryStructureResult = collections.namedtuple(
    'BasicSecondaryStructureResult',
    ['DSSP3', 'DSSP8', 'disorder', 'DSSP3_raw', 'DSSP8_raw', 'disorder_raw'],
)

# Per-protein predictions.
SubcellularLocalizationAndMembraneBoundness = collections.namedtuple(
    'SubcellularLocalizationAndMembraneBoundness',
    ['localization', 'membrane'],
)

# Union of the two result types above.
BasicExtractedAnnotations = collections.namedtuple(
    'BasicExtractedAnnotations',
    ['DSSP3', 'DSSP8', 'disorder', 'DSSP3_raw', 'DSSP8_raw', 'disorder_raw', 'localization', 'membrane'],
)


class BasicAnnotationExtractor:
    """
    Predict basic annotations from protein embeddings with two pre-trained models.

    A ``SecStructCNN`` yields DSSP3, DSSP8 and disorder predictions, and a ``SubCellFNN``
    yields subcellular localization and membrane-boundness. Both models are loaded from
    checkpoint files and put into evaluation mode on construction.
    """

    necessary_files = ["secondary_structure_checkpoint_file", "subcellular_location_checkpoint_file"]

    # Supported values of ``model_type``. SeqVec embeddings carry an extra leading layer
    # dimension that has to be summed away; the ProtTrans models (Bert/T5) do not.
    _SEQVEC_MODEL_TYPE = "seqvec_from_publication"
    _PROTTRANS_MODEL_TYPES = ("bert_from_publication", "t5_xl_u50_from_publication")

    def __init__(self, model_type: str, device: Union[None, str, torch.device] = None, **kwargs):
        """
        Initialize annotation extractor. Checkpoint paths may be passed as keyword arguments;
        any path that is missing (or falsy) is resolved through ``get_model_file``.

        :param model_type: "seqvec_from_publication", "bert_from_publication" or
            "t5_xl_u50_from_publication"
        :param device: forwarded to ``get_device`` to pick the device the models run on
        :param secondary_structure_checkpoint_file: path of secondary structure inference model checkpoint file
        :param subcellular_location_checkpoint_file: path of the subcellular location inference model checkpoint file
        :raises NotImplementedError: if ``model_type`` is not one of the supported values
        """

        self._options = kwargs
        self._model_type = model_type
        self._device = get_device(device)

        # Create un-trained (raw) model and ensure self._model_type is valid.
        # This happens before any checkpoint lookup so that an invalid type fails fast.
        if self._model_type == self._SEQVEC_MODEL_TYPE:
            self._subcellular_location_model = SubCellFNN().to(self._device)
        elif self._model_type in self._PROTTRANS_MODEL_TYPES:
            # Drop batchNorm for ProtTrans models
            self._subcellular_location_model = SubCellFNN(use_batch_norm=False).to(self._device)
        else:
            raise NotImplementedError(f"You first need to define your custom model architecture {self._model_type}")

        # Download the checkpoint files if needed
        for file in self.necessary_files:
            if not self._options.get(file):
                self._options[file] = get_model_file(model=f"{self._model_type}_annotations_extractors", file=file)

        self._secondary_structure_checkpoint_file = self._options['secondary_structure_checkpoint_file']
        self._subcellular_location_checkpoint_file = self._options['subcellular_location_checkpoint_file']

        # Create the un-trained secondary structure model
        self._secondary_structure_model = SecStructCNN().to(self._device)

        # Load pre-trained weights for annotation machines.
        # NOTE(review): torch.load unpickles the file - only point this at trusted checkpoints.
        subcellular_state = torch.load(self._subcellular_location_checkpoint_file, map_location=self._device)
        secondary_structure_state = torch.load(self._secondary_structure_checkpoint_file, map_location=self._device)

        # Load pre-trained weights into raw models
        self._subcellular_location_model.load_state_dict(subcellular_state['state_dict'])
        self._secondary_structure_model.load_state_dict(secondary_structure_state['state_dict'])

        # Ensure that models are in evaluation mode (important for batchnorm and dropout)
        self._subcellular_location_model.eval()
        self._secondary_structure_model.eval()

    def _embedding_to_tensor(self, raw_embedding: ndarray) -> torch.Tensor:
        """Return ``raw_embedding`` as a float32 tensor on ``self._device``."""
        # The cast to float32 is needed for T5 embeddings stored as fp16
        return torch.tensor(raw_embedding.astype(numpy.float32)).to(self._device)

    def get_subcellular_location(self, raw_embedding: ndarray) -> SubcellularLocalizationAndMembraneBoundness:
        """
        Predict subcellular localization and membrane-boundness for one protein.

        :param raw_embedding: embedding of a single protein; for SeqVec the first axis is
            summed (layers) before averaging, for Bert/T5 the first axis is averaged directly
        :return: named tuple with the predicted ``localization`` and ``membrane`` labels
        :raises NotImplementedError: if the model type of this instance is not supported
        """
        embedding = self._embedding_to_tensor(raw_embedding)

        # Reduce the embedding to a fixed-size per-protein vector. This is similar to
        # embedder.reduce_per_protein(), but may run on GPU (see self._device).
        if self._model_type == self._SEQVEC_MODEL_TYPE:
            # SeqVec case: sum over the leading (layer) axis, then average over the next axis
            pooled = embedding.sum(dim=0).mean(dim=0, keepdim=True)
        elif self._model_type in self._PROTTRANS_MODEL_TYPES:
            # Bert/T5 case: only the last layer is present, so just average
            pooled = embedding.mean(dim=0, keepdim=True)
        else:
            raise NotImplementedError(f"Unsupported model type {self._model_type}")

        # Pure inference: no autograd graph is needed
        with torch.no_grad():
            yhat_loc, yhat_mem = self._subcellular_location_model(pooled)

        # The index of the output node with max. activation is the predicted class
        pred_loc = _location_labels[torch.argmax(yhat_loc, dim=1).item()]
        pred_mem = _membrane_labels[torch.argmax(yhat_mem, dim=1).item()]

        return SubcellularLocalizationAndMembraneBoundness(localization=pred_loc, membrane=pred_mem)

    def get_secondary_structure(self, raw_embedding: ndarray) -> BasicSecondaryStructureResult:
        """
        Predict DSSP3, DSSP8 and disorder for one protein.

        :param raw_embedding: embedding of a single protein (3 dims for SeqVec, 2 dims for Bert/T5)
        :return: named tuple with label lists (``DSSP3``, ``DSSP8``, ``disorder``) and the
            softmax outputs of the first batch element (``DSSP3_raw``, ``DSSP8_raw``,
            ``disorder_raw``) as tensors
        :raises NotImplementedError: if the model type of this instance is not supported
        """
        embedding = self._embedding_to_tensor(raw_embedding)

        # Same as for subcell loc.: SeqVec requires summing over layers while ProtTrans
        # models only extract the last layer. Both branches produce a 4-dim tensor with
        # the feature axis in second position and a trailing axis of size one.
        if self._model_type == self._SEQVEC_MODEL_TYPE:
            # SeqVec case
            model_input = embedding.sum(dim=0, keepdim=True).permute(0, 2, 1).unsqueeze(dim=-1)
        elif self._model_type in self._PROTTRANS_MODEL_TYPES:
            # Bert/T5 case: flip dimensions to make the feature dimension come first
            model_input = embedding.T[None, :, :, None]
        else:
            raise NotImplementedError(f"Unsupported model type {self._model_type}")

        # Pure inference: no autograd graph is needed
        with torch.no_grad():
            yhat_dssp3, yhat_dssp8, yhat_disor = self._secondary_structure_model(model_input)

            pred_dssp3_raw = torch.softmax(yhat_dssp3, dim=1)[0]
            pred_dssp8_raw = torch.softmax(yhat_dssp8, dim=1)[0]
            pred_disor_raw = torch.softmax(yhat_disor, dim=1)[0]

        pred_dssp3 = self._class2label(_dssp3_labels, yhat_dssp3)
        pred_dssp8 = self._class2label(_dssp8_labels, yhat_dssp8)
        pred_disor = self._class2label(_disorder_labels, yhat_disor)

        return BasicSecondaryStructureResult(DSSP3=pred_dssp3, DSSP8=pred_dssp8, disorder=pred_disor,
                                             DSSP3_raw=pred_dssp3_raw, DSSP8_raw=pred_dssp8_raw,
                                             disorder_raw=pred_disor_raw)

    def get_annotations(self, raw_embedding: ndarray) -> BasicExtractedAnnotations:
        """
        Run both predictors on one embedding and merge their results.

        :param raw_embedding: embedding of a single protein
        :return: named tuple combining the fields of ``get_secondary_structure`` and
            ``get_subcellular_location``
        """
        secstruct = self.get_secondary_structure(raw_embedding)
        subcell = self.get_subcellular_location(raw_embedding)

        return BasicExtractedAnnotations(disorder=secstruct.disorder, DSSP8=secstruct.DSSP8,
                                         DSSP3=secstruct.DSSP3, localization=subcell.localization,
                                         membrane=subcell.membrane, disorder_raw=secstruct.disorder_raw,
                                         DSSP3_raw=secstruct.DSSP3_raw, DSSP8_raw=secstruct.DSSP8_raw)

    @staticmethod
    def _class2label(label_dict: Dict[int, Any], yhat: torch.Tensor) -> List[Any]:
        """
        Translate model outputs into labels.

        :param label_dict: mapping from class index to label
        :param yhat: model output with the class scores along dim 1
        :return: one label per remaining position, in order
        """
        # Index of the output node with max. activation (= predicted class).
        # flatten() rather than squeeze(): squeeze() would turn the result for a sequence
        # of length one into a 0-d tensor, which cannot be iterated.
        class_indices = torch.argmax(yhat, dim=1).flatten().tolist()
        return [label_dict[class_idx] for class_idx in class_indices]