Source code for bio_embeddings.embed.word2vec_embedder

import re

import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from numpy import ndarray

from bio_embeddings.embed.embedder_interfaces import EmbedderInterface


class Word2VecEmbedder(EmbedderInterface):
    """Embed protein sequences with a word2vec model over overlapping 3-mers.

    Each residue is represented by the vector of the 3-mer centred on it;
    a per-protein embedding is the mean over all residue vectors.
    """

    name = "word2vec"
    embedding_dimension = 512
    number_of_layers = 1
    necessary_files = ["model_file"]

    def __init__(self, **kwargs):
        """
        :param model_file: path of model file. If not supplied, will be downloaded.
        """
        super().__init__(**kwargs)
        self._model_file = self._options.get("model_file")
        # mmap="r" maps the vector matrix read-only instead of loading a copy
        self._model = KeyedVectors.load(str(self._model_file), mmap="r")
        self._vector_size = 512
        self._zero_vector = np.zeros(self._vector_size, dtype=np.float32)
        self._window_size = 3

    def embed(self, sequence: str) -> ndarray:
        """Return a (len(sequence), 512) float32 array of per-residue embeddings."""
        # Map rare/ambiguous amino acids to the wildcard X used during training
        sequence = re.sub(r"[UZOB]", "X", sequence)

        # Pad so the first and last residue also get a full 3-mer window
        # (only 3-mers are considered)
        padded_sequence = "-" + sequence + "-"

        # container
        embedding = np.zeros((len(sequence), self._vector_size), dtype=np.float32)

        # One 3-mer (window) per residue. Iterate exactly len(sequence) windows
        # instead of over-running and catching IndexError, as the original did:
        # that relied on an exception for loop termination and would implicitly
        # return None if the loop ever finished normally.
        for index in range(len(sequence)):
            k_mer = padded_sequence[index : index + self._window_size]
            embedding[index, :] = self._get_kmer_representation(k_mer)
        return embedding

    def _get_kmer_representation(self, k_mer):
        """Look up a k-mer vector, degrading gracefully for padded/OOV k-mers."""
        # try to retrieve embedding for k-mer
        try:
            return self._model.wv[k_mer]
        # in case of padded or out-of-vocab character
        except KeyError:
            # if single AA was not part of corpus (or no AA)
            if len(k_mer) <= 1:
                return self._zero_vector
            # handle border cases at start/end of seq: fall back to the
            # centre residue of the window
            elif "-" in k_mer:
                idx_center = len(k_mer) // 2
                return self._get_kmer_representation(k_mer[idx_center])
            # Out-of-vocabulary interior k-mer (e.g. containing X): the
            # original fell through here and implicitly returned None, which
            # crashes the row assignment in embed(). Use the zero vector.
            return self._zero_vector

    @staticmethod
    def reduce_per_protein(embedding: ndarray) -> ndarray:
        """Mean-pool per-residue embeddings into a single 512-d protein vector."""
        return embedding.mean(axis=0)