def get_device(device: Union[None, str, torch.device] = None) -> torch.device:
    """Returns what the user specified, or defaults to the GPU, with a fallback to CPU if no GPU is available."""
    if isinstance(device, torch.device):
        return device
    elif device:
        return torch.device(device)
    elif torch.cuda.is_available():
        return torch.device("cuda")
    else:
        return torch.device("cpu")
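
# Usage sketch (illustrative only; the example function below is not part of the original module).
# get_device accepts None, a device string, or an existing torch.device and always returns a
# torch.device that downstream code can pass to .to() or tensor constructors.
def _example_get_device():  # hypothetical helper, for illustration
    import torch

    device = get_device()          # "cuda" if a GPU is visible, otherwise "cpu"
    same = get_device(device)      # torch.device inputs are returned unchanged
    explicit = get_device("cpu")   # strings are wrapped into torch.device objects
    return torch.zeros(3, device=device), same, explicit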
def check_required(params: dict, keys: List[str]):
    """
    Verify that the required set of parameters is present in the configuration

    Parameters
    ----------
    params : dict
        Dictionary with parameters
    keys : list-like
        Set of parameters that has to be present in params

    Raises
    ------
    MissingParameterError
    """
    missing = [k for k in keys if k not in params]

    if len(missing) > 0:
        raise MissingParameterError(
            "Missing required parameters: {}\nGiven: {}".format(", ".join(missing), params)
        )
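
# Usage sketch (illustrative only; the keys below are made-up config entries).
# check_required raises a MissingParameterError listing every key absent from the dict.
def _example_check_required():  # hypothetical helper, for illustration
    params = {"protocol": "seqvec", "sequences_file": "input.fasta"}
    check_required(params, ["protocol", "sequences_file"])  # passes silently
    try:
        check_required(params, ["protocol", "prefix"])      # "prefix" is missing
    except MissingParameterError as error:
        return error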
def read_fasta(path: str) -> List[SeqRecord]:
    """
    Helper function to read FASTA file.

    :param path: path to a valid FASTA file
    :return: a list of SeqRecord objects.
    """
    try:
        return list(SeqIO.parse(path, "fasta"))
    except FileNotFoundError:
        raise  # Already says "No such file or directory"
    except Exception as e:
        raise ValueError(
            f"Could not parse '{path}'. Are you sure this is a valid fasta file?"
        ) from e
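
# Usage sketch (illustrative only; the path argument is a placeholder supplied by the caller).
# read_fasta returns plain SeqRecord objects, so ids and sequences are available right away.
def _example_read_fasta(fasta_path: str):  # hypothetical helper, for illustration
    records = read_fasta(fasta_path)
    return {record.id: len(record.seq) for record in records}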
def reindex_sequences(sequence_records: List[SeqRecord], simple=False) -> DataFrame:
    """
    Sorts and re-indexes the sequence_records IN PLACE! (changes the original list!).
    Returns a DataFrame with the mapping.

    :param sequence_records: List of sequence records
    :param simple: Boolean; if set to True, use a numerical index (0, 1, 2, ...) instead of an MD5 hash
    :return: A DataFrame indexed by the new ids, with a column "original_id" containing the previous id
        and a column "sequence_length" containing the sequence length.
    """
    sequence_records[:] = sorted(sequence_records, key=lambda seq: -len(seq))
    original_ids = [s.id for s in sequence_records]

    if simple:
        new_ids = list()
        for id, record in enumerate(sequence_records):
            record.id = str(id)
            new_ids.append(str(id))
    else:
        sequence_records[:] = map(_assign_hash, sequence_records)
        new_ids = [s.id for s in sequence_records]

    df = DataFrame(
        zip(original_ids, [len(seq) for seq in sequence_records]),
        columns=["original_id", "sequence_length"],
        index=new_ids,
    )

    return df
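
# Usage sketch (illustrative only; paths are placeholders). The list passed to reindex_sequences
# is sorted and re-identified in place; the returned DataFrame is the only place where the
# original FASTA ids survive, so it is typically written out as the mapping file.
def _example_reindex_sequences(fasta_path: str, mapping_out_path: str):  # hypothetical helper
    records = read_fasta(fasta_path)
    mapping = reindex_sequences(records, simple=False)  # records now carry MD5 ids
    mapping.to_csv(mapping_out_path)                    # index = new id, column "original_id"
    return records, mapping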
def reindex_h5_file(h5_file_path: str, mapping_file_path: str):
    """
    Will rename the dataset keys using the "original_id" from the mapping file. This operation is
    generally considered unsafe, as the "original_id" is unsafe (may contain invalid characters,
    duplicates, or empty strings). Some sanity checks are performed before starting the renaming
    process, but generally applying this function is discouraged unless you know what you are doing.

    :param h5_file_path: path to the h5 file to re-index
    :param mapping_file_path: path to the mapping file (its first column must contain the current keys,
        and a column "original_id" the new desired ids)
    :return: Nothing -- conversion happens in place!
    """
    mapping_file = read_csv(mapping_file_path, index_col=0)
    mapping_file.index = mapping_file.index.map(str)
    mapping_file['original_id'] = mapping_file['original_id'].astype(str)

    conversion_table = list(zip(mapping_file.index.values, mapping_file['original_id'].values))

    unique_froms = set([e[0] for e in conversion_table])
    # Empty original_ids are dropped here so that the count check below catches them
    unique_tos = set([e[1] for e in conversion_table if e[1]])

    if len(unique_froms) != len(unique_tos):
        raise ConversionUniqueMismatch(
            f"Conversion unique count mismatch.\n"
            f"Your mapping file contains {len(unique_froms)} unique ids, which you are trying to convert to {len(unique_tos)} unique original_ids.\n"
            f"These numbers *must* match. You likely have: duplicate original_id's, or empty strings in original_id."
        )

    with h5py.File(h5_file_path, "r+") as h5_file:
        keys_set = set(h5_file.keys())
        unchanged_set = keys_set - unique_froms

        if len(unchanged_set) > 0:
            logger.warning(
                f"There are some keys in your h5 file which won't be re-indexed!\n"
                f"These are: {unchanged_set}."
            )

        changeable_set = unique_froms.intersection(keys_set)

        if len(changeable_set) == 0:
            logger.info("Nothing was re-indexed.")
        else:
            logger.info(f"Reindexing the following keys: {changeable_set}")

            for (from_index, to_index) in filter(lambda item: item[0] in keys_set, conversion_table):
                h5_file.move(from_index, to_index)
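
# Usage sketch (illustrative only; paths are placeholders). Because the renaming happens in place,
# a sensible workflow is to copy the embeddings file first and only then rewrite its keys back to
# the original FASTA ids using the mapping file produced during the embed stage.
def _example_reindex_h5_file(h5_path: str, mapping_path: str):  # hypothetical helper
    import shutil

    shutil.copyfile(h5_path, h5_path + ".bak")  # keep a backup; the move is destructive
    reindex_h5_file(h5_path, mapping_path)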
def remove_identifiers_from_annotations_file(faulty_identifiers: list, annotation_file_path: str) -> DataFrame:
    """
    Removes the given identifiers from an annotations file.

    :param faulty_identifiers: a list of identifiers to remove
    :param annotation_file_path: a str detailing the path to the annotations file
    :return: a new DataFrame with the annotations removed
    """
    annotation_file = read_csv(annotation_file_path)

    return annotation_file[
        annotation_file['identifier'].isin(
            set(annotation_file['identifier'].values) - set(faulty_identifiers)
        )
    ]
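
# Usage sketch (illustrative only; the ids and paths are placeholders). The returned DataFrame
# keeps every row whose "identifier" is not in the faulty list.
def _example_remove_identifiers(annotations_path: str):  # hypothetical helper
    cleaned = remove_identifiers_from_annotations_file(["bad_id_1", "bad_id_2"], annotations_path)
    cleaned.to_csv("annotations_cleaned.csv", index=False)
    return cleaned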
class QueryEmbeddingsFile:
    """
    A helper class that allows you to retrieve embeddings from an embeddings file based on either
    the `original_id` (extracted from the FASTA header during the embed stage), or via the `new_id`
    (assigned during the embed stage: either an MD5 hash of the input sequence, or an integer if
    `remapping_simple: True`).

    Available for embeddings created with the pipeline starting with v0.1.5

    .. code-block:: python

        import h5py
        from bio_embeddings.utilities import QueryEmbeddingsFile

        with h5py.File("path/to/file.h5", "r") as file:
            embedding_querier = QueryEmbeddingsFile(file)
            print(embedding_querier.query_original_id("Some_Database_ID_1234").mean())
    """
    def __init__(self, embeddings_file: h5py.File):
        """
        :param embeddings_file: an h5py File, aka `h5py.File("/path/to/file.h5")`.
        """
        self._lookup_table = dict(
            (embeddings_file[new_id].attrs["original_id"], new_id)
            for new_id in embeddings_file.keys()
        )
        self._embeddings_file = embeddings_file
    def query_original_id(self, original_id: str) -> np.array:
        """
        Query the embeddings file using the original id, aka. the string extracted from the FASTA
        header of the sequence.

        :param original_id: a string representing the id extracted from the FASTA header
        :return: the embedding as a numpy array
        """
        return np.array(self._embeddings_file[self._lookup_table[original_id]])
    def query_new_id(self, new_id: str) -> np.array:
        """
        Query the embeddings file using the new id, aka. either the MD5 hash of the sequence or a number.

        :param new_id: a string representing the new id.
        :return: the embedding as a numpy array
        """
        return np.array(self._embeddings_file[new_id])
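
# Usage sketch (illustrative only; the path and id are placeholders). When the h5 keys themselves
# are known -- e.g. taken from the mapping file index -- query_new_id skips the original_id
# lookup table entirely.
def _example_query_new_id(h5_path: str, new_id: str):  # hypothetical helper
    import h5py

    with h5py.File(h5_path, "r") as embeddings_file:
        querier = QueryEmbeddingsFile(embeddings_file)
        return querier.query_new_id(new_id)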
def read_mapping_file(mapping_file: str) -> DataFrame:
    """Reads mapping_file.csv and ensures consistent types"""
    # We want to read the unnamed column 0 as str (esp. with simple_remapping), which requires some workarounds
    # https://stackoverflow.com/a/29793294/3549270
    mapping_file = read_csv(mapping_file, index_col=0)
    mapping_file.index = mapping_file.index.astype("str")
    return mapping_file
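
# Usage sketch (illustrative only; the path is a placeholder). read_mapping_file keeps the index
# as strings, so simple numeric remappings ("0", "1", ...) still line up with the string keys
# used inside the embeddings h5 file.
def _example_read_mapping_file(mapping_path: str):  # hypothetical helper
    mapping = read_mapping_file(mapping_path)
    return dict(zip(mapping.index, mapping["original_id"]))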