def pairwise_distance_matrix_from_embeddings_and_annotations(query_embeddings_path: str,
                                                             reference_embeddings_path: str,
                                                             metric: str = "euclidean",
                                                             n_jobs: int = 1) -> PairwiseDistanceMatrixResult:
    """
    Load query and reference embeddings from two h5 files and compute the
    pairwise distances between every query and every reference.

    Every top-level key of each file is treated as one identifier, and the entry
    stored under it is converted to a numpy array and used as that identifier's
    embedding.

    :param query_embeddings_path: A string defining a path to an h5 file
    :param reference_embeddings_path: A string defining a path to an h5 file
    :param metric: Metric to use (string!), forwarded to `_pairwise_distances`
        (presumably scikit-learn's `pairwise_distances` -- see its documentation)
    :param n_jobs: int, forwarded to `_pairwise_distances`

    :return: A PairwiseDistanceMatrixResult with the fields:
        - pairwise_matrix: the pairwise distances between queries and references
        - queries: A list of strings defining the queries
        - references: A list of strings defining the references
    """
    # The reference file is opened first, then the query file; both are only read.
    with h5py.File(reference_embeddings_path, 'r') as reference_file, \
            h5py.File(query_embeddings_path, 'r') as query_file:
        references: List[str] = list(reference_file.keys())
        queries: List[str] = list(query_file.keys())

        # np.array(...) copies each entry into memory, so the embeddings stay
        # usable after the files are closed.
        reference_embeddings = [np.array(reference_file[identifier]) for identifier in references]
        query_embeddings = [np.array(query_file[identifier]) for identifier in queries]

    distance_matrix = _pairwise_distances(
        query_embeddings,
        reference_embeddings,
        metric=metric,
        n_jobs=n_jobs,
    )

    return PairwiseDistanceMatrixResult(
        pairwise_matrix=distance_matrix,
        queries=queries,
        references=references,
    )
def get_k_nearest_neighbours(pairwise_matrix: np.ndarray, k: int = 1) -> Tuple[List[List[int]], np.ndarray]:
    """
    Find, for every row of a distance matrix, the k smallest entries.

    Each row of `pairwise_matrix` is processed independently: the positions
    (column indices) of its k smallest values are collected together with the
    values themselves, ordered by ascending distance (ties are broken by
    ascending index).

    :param pairwise_matrix: a 2-D array (or nested sequence) of distances; one row
        per item to look up neighbours for, one column per candidate neighbour
    :param k: the number of nearest neighbours to return per row; must be at least 1
        and at most the number of columns

    :return: a tuple `(indices, distances)` where
        - indices: a list with one entry per row, each a list of the k column
          indices of the nearest neighbours (sorted by distance asc.)
        - distances: an np.ndarray with one row per input row holding the
          matching k distances (sorted asc.)

    :raises ValueError: if k is smaller than 1 or larger than the number of
        entries in a row
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    resulting_indices = list()
    resulting_distances = list()

    for neighbour_distances in pairwise_matrix:
        neighbour_distances = np.asarray(neighbour_distances)

        if k > neighbour_distances.size:
            raise ValueError(
                f"k={k} is larger than the number of available neighbours ({neighbour_distances.size})"
            )

        # Partition around position k - 1 (not k): afterwards the first k positions
        # hold the k smallest values, and k == row length stays within bounds.
        nearest_neighbour_indices = np.argpartition(neighbour_distances, k - 1)[:k]
        nearest_neighbour_distances = neighbour_distances[nearest_neighbour_indices]

        # argpartition returns the k nearest neighbours in arbitrary order.
        # Sort by distance first and index second (lexsort's last key is the primary one)
        # so that indices and distances are reordered identically.
        order = np.lexsort((nearest_neighbour_indices, nearest_neighbour_distances))

        resulting_indices.append(list(nearest_neighbour_indices[order]))
        resulting_distances.append(nearest_neighbour_distances[order])

    return resulting_indices, np.array(resulting_distances)