class CPCProtEmbedder(EmbedderInterface):
    """CPCProt Embedder

    Lu, Amy X., et al. "Self-supervised contrastive learning of protein
    representations by mutual information maximization." bioRxiv (2020).
    https://doi.org/10.1101/2020.09.04.283929
    """

    name = "cpcprot"
    embedding_dimension = 512
    number_of_layers = 1
    necessary_files = ["model_file"]

    def __init__(self, device: Union[None, str, torch.device] = None, **kwargs):
        """Build the IUPAC tokenizer and load the CPCProt checkpoint from
        ``self._options["model_file"]`` onto the requested device."""
        super().__init__(device, **kwargs)
        self.tokenizer = Tokenizer(vocab="iupac")

        # Build the config from DEFAULT_CONFIG ourselves: otherwise
        # CPCProtModel grabs a GPU whenever one is available, even when the
        # caller explicitly asked for the CPU.
        config_dict = DEFAULT_CONFIG.to_dict()
        config_dict["use_cuda"] = self._device.type == "cuda"
        raw_model = CPCProtModel(cfg=CPCProtConfig.from_dict(config_dict)).to(
            self._device
        )

        # Checkpoints saved through nn.DataParallel prefix every parameter
        # name with "module."; strip that prefix so the plain model loads them.
        state_dict = dict(
            torch.load(self._options["model_file"], map_location=self._device)
        )
        for key in list(state_dict):
            if key.startswith("module."):
                state_dict[key[len("module."):]] = state_dict.pop(key)
        raw_model.load_state_dict(state_dict)
        self._model = CPCProtEmbedding(raw_model.to(self._device).eval())

    def embed_batch(self, batch: List[str]) -> Generator[ndarray, None, None]:
        """Tokenize, pad, and embed a batch of sequences, yielding one
        embedding array per input sequence.

        See https://github.com/amyxlu/CPCProt/blob/df1ad1118544ed349b5e711207660a7c205b3128/embed_fasta.py
        """
        token_arrays = [
            numpy.array(self.tokenizer.encode(sequence)) for sequence in batch
        ]
        # 11 is CPCProt's minimum patch size, so every sequence is
        # zero-padded to at least that length.
        target_length = max(11, max(arr.shape[0] for arr in token_arrays))
        padded = numpy.array(
            [numpy.pad(arr, (0, target_length - arr.shape[0])) for arr in token_arrays]
        )
        torch_inputs = torch.from_numpy(padded)
        # NOTE(review): torch_inputs is never explicitly moved to
        # self._device; presumably the model moves it internally when
        # use_cuda is set — confirm on a GPU machine.
        yield from self._model.get_z_mean(torch_inputs).detach().cpu().numpy()