class ProtTransXLNetUniRef100Embedder(EmbedderInterface):
    """ProtTrans-XLNet-UniRef100 Embedder (ProtXLNet)

    Elnaggar, Ahmed, et al. "ProtTrans: Towards Cracking the Language of Life's
    Code Through Self-Supervised Deep Learning and High Performance Computing."
    arXiv preprint arXiv:2007.06225 (2020). https://arxiv.org/abs/2007.06225
    """

    name = "prottrans_xlnet_uniref100"
    embedding_dimension = 1024
    number_of_layers = 1

    _model: XLNetModel
    _model_fallback: Optional[XLNetModel]

    necessary_directories = ["model_directory"]
    def __init__(self, **kwargs):
        """
        Initialize XLNet embedder.

        :param model_directory: directory containing the pretrained XLNet weights and the SentencePiece model
        """
        super().__init__(**kwargs)

        # Get file locations from kwargs
        self.model_directory = self._options["model_directory"]

        # mem_len=512 is from https://github.com/agemagician/ProtTrans/blob/master/Embedding/PyTorch/Advanced/ProtXLNet.ipynb
        self._model = (
            XLNetModel.from_pretrained(self.model_directory, mem_len=512)
            .to(self._device)
            .eval()
        )
        self._model_fallback = None

        # SentencePiece model:
        # A standard text tokenizer which creates the input for NNs trained on text.
        # Here it simply indexes single amino acids, because we only have words of length 1.
        spm_model = str(Path(self.model_directory).joinpath("spm_model.model"))
        self._tokenizer = XLNetTokenizer.from_pretrained(spm_model, do_lower_case=False)
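The following is a minimal usage sketch, not part of the module source: it assumes an already constructed instance (here called ``embedder``) and shows how a single protein sequence could be tokenized and run through the model and tokenizer set up above. The helper name ``embed_sequence_sketch`` is hypothetical; the library's actual embedding code may differ, e.g. in how special tokens, attention masks, and the CPU fallback model are handled.

# Illustrative sketch only, assuming an existing ProtTransXLNetUniRef100Embedder instance.
import torch


def embed_sequence_sketch(embedder, sequence: str) -> torch.Tensor:
    # Space-separate the residues so the SentencePiece model maps each amino acid to one token.
    token_ids = embedder._tokenizer.encode(" ".join(sequence), return_tensors="pt")
    token_ids = token_ids.to(embedder._device)
    with torch.no_grad():
        # First output element is the hidden states: (1, sequence length + special tokens, 1024)
        hidden_states = embedder._model(token_ids)[0]
    return hidden_states.squeeze(0)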