Source code for bio_embeddings.embed.bepler_embedder
"""Most of this implementation is taken fromhttps://github.com/tbepler/protein-sequence-embedding-iclr2019/blob/3bb338bd70e2b7b97c733304d50cfcac9c35cb27/embed_sequences.py---Supporting torch > 1.3 was a bit tricky (https://github.com/tbepler/protein-sequence-embedding-iclr2019/issues/21).Here's what I did:First, download and unpack the weights to `pretrained_weights`.```wget http://bergerlab-downloads.csail.mit.edu/bepler-protein-sequence-embeddings-from-structure-iclr2019/pretrained_models.tar.gztar xf pretrained_models.tar.gz```Create a torch 1.3 virtualenv (with python 3.7). In this venv, run:```shell_scriptgit clone https://github.com/tbepler/protein-sequence-embedding-iclr2019cd protein-sequence-embedding-iclr2019pip install Cythonpython setup.py install``````pythonimport torchmodel = torch.load("pretrained_models/ssa_L1_100d_lstm3x512_lm_i512_mb64_tau0.5_lambda0.1_p0.05_epoch100.sav")# For some reason torch seems to have missed that in older versions, but requires it in newer onesstate_dict = model.state_dict()state_dict["scop_predict.gap"] = torch.FloatTensor([-10])torch.save(state_dict, "pretrained_models/ssa_L1_100d_lstm3x512_lm_i512_mb64_tau0.5_lambda0.1_p0.05_epoch100_updated.state_dict")print(model)```I then switched back to my normal torch 1.5.1 environment and recreated theprinted model params as you can see in the __init__ function."""fromtypingimportUnionimportnumpyimporttorchfrombepler.alphabetsimportUniprot21frombepler.models.embeddingimportStackedRNNfrombepler.models.multitaskimportSCOPCMfrombepler.models.sequenceimportBiLMfromnumpyimportndarrayfromtorchimportnnfrombio_embeddings.embedimportEmbedderInterfacedef_unstack_lstm(lstm,device:torch.device):in_size=lstm.input_sizehidden_dim=lstm.hidden_sizelayers=[]foriinrange(lstm.num_layers):layer=nn.LSTM(in_size,hidden_dim,batch_first=True,bidirectional=True)layer.to(device)attributes=["weight_ih_l","weight_hh_l","bias_ih_l","bias_hh_l"]forattrinattributes:dest=attr+"0"src=attr+str(i)getattr(layer,dest).data[:]=getattr(lstm,src)dest=attr+"0_reverse"src=attr+str(i)+"_reverse"getattr(layer,dest).data[:]=getattr(lstm,src)layer.flatten_parameters()layers.append(layer)in_size=2*hidden_dimreturnlayers
class BeplerEmbedder(EmbedderInterface):
    """Bepler Embedder

    Bepler, Tristan, and Bonnie Berger. "Learning protein sequence embeddings
    using information from structure." arXiv preprint arXiv:1902.08661 (2019).
    """

    name = "bepler"
    embedding_dimension = 121  # 100 + len(self.alphabet)
    number_of_layers = 1
    # This is derived from ssa_L1_100d_lstm3x512_lm_i512_mb64_tau0.5_lambda0.1_p0.05_epoch100.sav
    # See text at the top of the file
    necessary_files = ["model_file"]
    def __init__(self, device: Union[None, str, torch.device] = None, **kwargs):
        super().__init__(device, **kwargs)
        self.alphabet = Uniprot21()

        # These parameters are part of the model, but we can't load them if we
        # use the state dict
        hidden = 512
        out = 100
        lm = BiLM(
            len(self.alphabet) + 1,
            len(self.alphabet),
            len(self.alphabet),
            hidden * 2,
            2,
        )
        embedding = StackedRNN(
            len(self.alphabet),
            hidden,
            hidden,
            out,
            nlayers=3,
            dropout=0,
            lm=lm,
        )
        self.model = SCOPCM(embedding)
        self.model.load_state_dict(torch.load(self._options["model_file"]))
        self.model = self.model.eval().to(self._device)
        self.lstm_stack = _unstack_lstm(self.model.embedding.rnn, self._device)
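    # Usage sketch (an assumption, not from the original file): the entries of
    # `necessary_files` are expected to arrive as keyword arguments and to be
    # exposed through self._options, so the state dict converted in the module
    # docstring would be passed as `model_file`, e.g.
    #
    #     embedder = BeplerEmbedder(
    #         model_file="pretrained_models/ssa_L1_100d_lstm3x512_lm_i512_mb64_tau0.5_lambda0.1_p0.05_epoch100_updated.state_dict"
    #     )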
    def embed(self, sequence: str) -> ndarray:
        # https://github.com/sacdallago/bio_embeddings/issues/116
        if not sequence:
            return numpy.zeros((0, self.embedding_dimension))

        x = sequence.upper().encode()
        # convert to alphabet index
        x = self.alphabet.encode(x)
        x = torch.from_numpy(x).to(self._device)

        # embed the sequence
        with torch.no_grad():
            x = x.long().unsqueeze(0)
            zs = []
            # One-hot encoding of the sequence (21 dimensions)
            # noinspection PyUnresolvedReferences
            x_onehot = x.new(x.size(0), x.size(1), 21).float().zero_()
            x_onehot.scatter_(2, x.unsqueeze(2), 1)
            zs.append(x_onehot)
            # Language-model embedding, passed through the unstacked LSTM layers
            h = self.model.embedding.embed(x)
            for lstm in self.lstm_stack:
                h, _ = lstm(h)
            # Project the final LSTM output to 100 dimensions
            h = self.model.embedding.proj(h.squeeze(0)).unsqueeze(0)
            zs.append(h)
            z = torch.cat(zs, 2)
        return z.squeeze(0).cpu().numpy()
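# Output shape sketch (hedged): embed() concatenates the 21-dimensional one-hot
# encoding with the 100-dimensional projection of the final LSTM layer, so for
# a sequence of length L the returned array has shape (L, 121). With the
# hypothetical `embedder` from the sketch above:
#
#     embedding = embedder.embed("SEQWENCE")
#     assert embedding.shape == (8, 121)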