Source code for bio_embeddings.extract.basic.basic_annotation_extractor
import collections
import logging
from typing import List, Union, Dict, Any

import numpy
import torch
from numpy import ndarray

from bio_embeddings.extract.annotations import Location, Membrane, Disorder, SecondaryStructure
from bio_embeddings.extract.basic.annotation_inference_models import SubCellFNN, SecStructCNN
from bio_embeddings.utilities import get_device, get_model_file

logger = logging.getLogger(__name__)
# Label mappings
_location_labels = {
0: Location.CELL_MEMBRANE,
1: Location.CYTOPLASM,
2: Location.ENDOPLASMATIC_RETICULUM,
3: Location.GOLGI_APPARATUS,
4: Location.LYSOSOME_OR_VACUOLE,
5: Location.MITOCHONDRION,
6: Location.NUCLEUS,
7: Location.PEROXISOME,
8: Location.PLASTID,
9: Location.EXTRACELLULAR
}
_membrane_labels = {
0: Membrane.SOLUBLE,
1: Membrane.MEMBRANE
}
_dssp8_labels = {
0: SecondaryStructure.THREE_HELIX,
1: SecondaryStructure.ALPHA_HELIX,
2: SecondaryStructure.FIVE_HELIX,
3: SecondaryStructure.ISOLATED_BETA_BRIDGE,
4: SecondaryStructure.EXTENDED_STRAND,
5: SecondaryStructure.BEND,
6: SecondaryStructure.TURN,
7: SecondaryStructure.IRREGULAR
}
_dssp3_labels = {
0: SecondaryStructure.ALPHA_HELIX,
1: SecondaryStructure.EXTENDED_STRAND,
2: SecondaryStructure.IRREGULAR
}
_disorder_labels = {
0: Disorder.ORDER,
1: Disorder.DISORDER
}
BasicSecondaryStructureResult = collections.namedtuple(
    'BasicSecondaryStructureResult', 'DSSP3 DSSP8 disorder DSSP3_raw DSSP8_raw disorder_raw'
)
SubcellularLocalizationAndMembraneBoundness = collections.namedtuple(
    'SubcellularLocalizationAndMembraneBoundness', 'localization membrane'
)
BasicExtractedAnnotations = collections.namedtuple(
    'BasicExtractedAnnotations', 'DSSP3 DSSP8 disorder DSSP3_raw DSSP8_raw disorder_raw localization membrane'
)
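
# Note on the result tuples (per the methods below): localization and membrane
# are single per-protein labels, while DSSP3, DSSP8 and disorder are per-residue
# label lists; the *_raw fields hold the corresponding per-residue class
# probabilities (softmax outputs).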


class BasicAnnotationExtractor:
    necessary_files = ["secondary_structure_checkpoint_file", "subcellular_location_checkpoint_file"]

    def __init__(self, model_type: str, device: Union[None, str, torch.device] = None, **kwargs):
"""
Initialize annotation extractor. Must define non-positional arguments for paths of files.
:param secondary_structure_checkpoint_file: path of secondary structure inference model checkpoint file
:param subcellular_location_checkpoint_file: path of the subcellular location inference model checkpoint file
"""
self._options = kwargs
self._model_type = model_type
self._device = get_device(device)
# Create un-trained (raw) model and ensure self._model_type is valid
        if self._model_type == "seqvec_from_publication":
            self._subcellular_location_model = SubCellFNN().to(self._device)
        elif self._model_type in ("bert_from_publication", "t5_xl_u50_from_publication"):
            # Drop batch norm for ProtTrans models
            self._subcellular_location_model = SubCellFNN(use_batch_norm=False).to(self._device)
        else:
            raise NotImplementedError(
                f"Unknown model type '{self._model_type}'; you first need to define a custom model architecture"
            )
# Download the checkpoint files if needed
for file in self.necessary_files:
if not self._options.get(file):
self._options[file] = get_model_file(model=f"{self._model_type}_annotations_extractors", file=file)
self._secondary_structure_checkpoint_file = self._options['secondary_structure_checkpoint_file']
self._subcellular_location_checkpoint_file = self._options['subcellular_location_checkpoint_file']
        # Create un-trained (raw) secondary structure model
        self._secondary_structure_model = SecStructCNN().to(self._device)

        # Read pre-trained weights for the annotation models
        subcellular_state = torch.load(self._subcellular_location_checkpoint_file, map_location=self._device)
        secondary_structure_state = torch.load(self._secondary_structure_checkpoint_file, map_location=self._device)

        # Load pre-trained weights into the raw models
        self._subcellular_location_model.load_state_dict(subcellular_state['state_dict'])
        self._secondary_structure_model.load_state_dict(secondary_structure_state['state_dict'])
        # Ensure that the models are in evaluation mode (important for batch norm and dropout)
self._subcellular_location_model.eval()
self._secondary_structure_model.eval()
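
    # Construction sketch (the checkpoint paths below are hypothetical; when the
    # corresponding keyword arguments are omitted, the published checkpoints are
    # downloaded via get_model_file as shown in __init__ above):
    #
    #     extractor = BasicAnnotationExtractor("bert_from_publication")
    #     # or, with local checkpoints:
    #     extractor = BasicAnnotationExtractor(
    #         "bert_from_publication",
    #         secondary_structure_checkpoint_file="/path/to/secstruct_checkpoint.pt",
    #         subcellular_location_checkpoint_file="/path/to/subcell_checkpoint.pt",
    #     )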

    def get_subcellular_location(self, raw_embedding: ndarray) -> SubcellularLocalizationAndMembraneBoundness:
raw_embedding = raw_embedding.astype(numpy.float32) # For T5 fp16
        # Reduce embedding to a fixed-size, per-sequence representation (aka: 3xLx1024 --> 1024).
        # This is similar to embedder.reduce_per_protein(),
        # but more efficient since it may run on the GPU (see self._device).
        # TODO: xxmh I forgot that SeqVec requires different pooling to derive the fixed-size representation.
        #  SeqVec requires summing over 3 layers, while ProtTrans models only extract the last layer.
        #  A quick & dirty solution is to check the shape of the embedding tensor, as SeqVec has 3 dims
        #  while ProtTrans should only have 2 dims.
        #  A better way would be to access some internal variable (probably I just missed this flag).
        # XXCD: we could check the embedder type via the protocol in the embed config, but this may become complicated...
if self._model_type == "seqvec_from_publication":
# SeqVec case
embedding = torch.tensor(raw_embedding).to(self._device).sum(dim=0).mean(dim=0, keepdim=True)
        elif self._model_type in ("bert_from_publication", "t5_xl_u50_from_publication"):
# Bert/T5 case
embedding = torch.tensor(raw_embedding).to(self._device).mean(dim=0, keepdim=True)
else:
raise NotImplementedError
        yhat_loc, yhat_mem = self._subcellular_location_model(embedding)

        # Map the index of the output node with the highest activation to the predicted class
        pred_loc = _location_labels[torch.max(yhat_loc, dim=1)[1].item()]
        pred_mem = _membrane_labels[torch.max(yhat_mem, dim=1)[1].item()]

        return SubcellularLocalizationAndMembraneBoundness(localization=pred_loc, membrane=pred_mem)
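
    # Usage sketch (a random array stands in for a real embedding, so the
    # prediction is meaningless; per the pooling above, SeqVec embeddings are
    # (3, L, 1024) and Bert/T5 embeddings are (L, 1024)):
    #
    #     loc = extractor.get_subcellular_location(numpy.random.randn(100, 1024))
    #     print(loc.localization, loc.membrane)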

    def get_secondary_structure(self, raw_embedding: ndarray) -> BasicSecondaryStructureResult:
raw_embedding = raw_embedding.astype(numpy.float32) # For T5 fp16
        # Same as for subcellular location: SeqVec requires summing over layers, while ProtTrans models only extract the last layer
        if self._model_type == "seqvec_from_publication":
            # SeqVec case: sum over the 3 layers, then reshape to (1, 1024, L, 1) for the CNN
            embedding = (
                torch.tensor(raw_embedding).to(self._device).sum(dim=0, keepdim=True).permute(0, 2, 1).unsqueeze(dim=-1)
            )
        elif self._model_type in ("bert_from_publication", "t5_xl_u50_from_publication"):
            # Bert/T5 case
            # Flip dimensions for ProtTrans models to make the feature dimension the first dimension: (1, 1024, L, 1)
            embedding = torch.tensor(raw_embedding).to(self._device).T[None, :, :, None]
        else:
            raise NotImplementedError
yhat_dssp3, yhat_dssp8, yhat_disor = self._secondary_structure_model(embedding)
pred_dssp3_raw = torch.softmax(yhat_dssp3, dim=1)[0]
pred_dssp3 = self._class2label(_dssp3_labels, yhat_dssp3)
pred_dssp8_raw = torch.softmax(yhat_dssp8, dim=1)[0]
pred_dssp8 = self._class2label(_dssp8_labels, yhat_dssp8)
pred_disor_raw = torch.softmax(yhat_disor, dim=1)[0]
pred_disor = self._class2label(_disorder_labels, yhat_disor)
return BasicSecondaryStructureResult(DSSP3=pred_dssp3, DSSP8=pred_dssp8, disorder=pred_disor,
DSSP3_raw=pred_dssp3_raw, DSSP8_raw=pred_dssp8_raw,
disorder_raw=pred_disor_raw)
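
    # Usage sketch (a random (L, 1024) array stands in for a real Bert/T5
    # embedding, so the labels are meaningless but the shapes are exercised):
    #
    #     ss = extractor.get_secondary_structure(numpy.random.randn(100, 1024))
    #     assert len(ss.DSSP3) == 100  # one label per residue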

    def get_annotations(self, raw_embedding: ndarray) -> BasicExtractedAnnotations:
secstruct = self.get_secondary_structure(raw_embedding)
subcell = self.get_subcellular_location(raw_embedding)
return BasicExtractedAnnotations(disorder=secstruct.disorder, DSSP8=secstruct.DSSP8,
DSSP3=secstruct.DSSP3, localization=subcell.localization,
membrane=subcell.membrane, disorder_raw=secstruct.disorder_raw,
DSSP3_raw=secstruct.DSSP3_raw, DSSP8_raw=secstruct.DSSP8_raw)

    @staticmethod
    def _class2label(label_dict: Dict[int, Any], yhat: torch.Tensor) -> List[Any]:
        # Get the index of the output node with the highest activation (= the predicted class)
        class_indices = torch.max(yhat, dim=1)[1].squeeze()
        return [label_dict[class_idx.item()] for class_idx in class_indices]
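

if __name__ == "__main__":
    # Minimal smoke-test sketch: assumes the published "bert_from_publication"
    # checkpoints can be downloaded. A random (L x 1024) array stands in for a
    # real ProtTrans embedding, so the predictions are meaningless, but the
    # full pipeline is exercised end to end.
    extractor = BasicAnnotationExtractor("bert_from_publication")
    fake_embedding = numpy.random.randn(50, 1024).astype(numpy.float32)  # L = 50 residues
    annotations = extractor.get_annotations(fake_embedding)
    print("localization:", annotations.localization, "| membrane:", annotations.membrane)
    print("per-residue DSSP3 labels:", len(annotations.DSSP3))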