class PLUSRNNEmbedder(EmbedderInterface):
    """Embedder backed by the PLUS RNN protein language model.

    Reference:
    Pre-Training of Deep Bidirectional Protein Sequence Representations with Structural Information
    Seonwoo Min, Seunghyun Park, Siwon Kim, Hyun-Soo Choi, Sungroh Yoon
    https://arxiv.org/abs/1912.05625

    The model weights are read from the ``model_file`` option.
    """

    name = "plus_rnn"
    number_of_layers = 1
    embedding_dimension = 1024

    necessary_files = ["model_file"]

    _alphabet: Protein
    _model: PLUS_RNN
    _model_cfg: ModelConfig
    _run_cfg: RunConfig

    def __init__(self, device: Union[None, str, torch.device] = None, **kwargs):
        """Build the PLUS RNN model, load its weights and move it to the device.

        :param device: Forwarded unchanged to the parent constructor.
        :param kwargs: Forwarded unchanged to the parent constructor; the
            ``model_file`` entry of ``self._options`` is used afterwards to
            load the weights.
        """
        super().__init__(device, **kwargs)

        # Same seed as the one used by PLUS itself.
        set_seeds(2020)

        # The upstream config json files aren't shipped with the package,
        # so their values are spelled out here instead.
        alphabet = Protein()
        self._alphabet = alphabet

        model_cfg = ModelConfig(input_dim=len(alphabet))
        model_cfg.model_type = "RNN"
        model_cfg.rnn_type = "B"
        model_cfg.num_layers = 3
        model_cfg.hidden_dim = 512
        model_cfg.embedding_dim = 100
        self._model_cfg = model_cfg

        run_cfg = RunConfig(sanity_check=True)
        run_cfg.batch_size_eval = 512
        self._run_cfg = run_cfg

        network = PLUS_RNN(model_cfg)
        network.load_weights(self._options["model_file"])
        self._model = network.to(self._device)

    def embed_batch(self, batch: List[str]) -> Generator[ndarray, None, None]:
        """Embed a batch of sequences and yield one array per sequence.

        :param batch: Sequences as strings. Each one is converted to bytes
            and upper-cased before being encoded with the alphabet.
        :return: Generator over numpy arrays taken from the second entry of
            the trainer's collected ``"embeddings"`` result.
        """
        # Alphabet encoding yields numpy arrays, which are turned into
        # integer (long) tensors right away.
        encoded_tensors = [
            torch.from_numpy(self._alphabet.encode(raw.encode().upper())).long()
            for raw in batch
        ]
        # NOTE(review): the meaning of the trailing positional `True` is
        # defined by Embedding_dataset, which is not visible here.
        dataset = Embedding_dataset(
            encoded_tensors, self._alphabet, self._run_cfg, True
        )
        loader = DataLoader(
            dataset,
            batch_size=self._run_cfg.batch_size_eval,
            collate_fn=collate_sequences_for_embedding,
        )

        # NOTE(review): the layout of this list is dictated by Trainer;
        # confirm the meaning of the flags against its implementation.
        model_entry = [self._model, "", True, False, False]
        # list of lists [idx, metrics_train, metrics_eval]
        task_entries = [["", [], []]]
        trainer = Trainer([model_entry], get_embedding, self._run_cfg, task_entries)

        for tokens, lengths in loader:
            # Only the tokens are moved to the device, see
            # https://github.com/pytorch/pytorch/issues/43227
            device_batch = (tokens.to(self._device), lengths)
            trainer.embed(device_batch, {"data_parallel": False})

        collected = trainer.tasks_dict["results_eval"][0]["embeddings"]
        # 1 is d_h with 1024 dimensions
        d_h_embeddings = collected[1]
        # The number of results is taken from entry 0, the values from entry 1.
        total = len(collected[0])
        position = 0
        while position < total:
            yield d_h_embeddings[position].numpy()
            position += 1

        # Only reached once the generator has been fully consumed.
        trainer.reset()