Pipeline Reference

Reference of all possible options for the pipeline configuration file (“config.yml”), which uses the YAML format.

# global options must be defined

global:
  # Required: Path to a Fasta file
  sequences_file: /path/to/sequences.fasta
  # Required: prefix (string) used for the output directory
  prefix: my_embeddings

  ## Optional: file manager
  # file_manager: [*filesystem]

  ## Optional: use simple index remapping instead of MD5-hash-based remapping (not encouraged)
  # simple_remapping: [True, *False]

  ## Stages are executed in the sequential order in which they are outlined in this file.
  ## Stage names must be unique; otherwise they will overwrite each other.
  ## The same stage type (e.g. embed) can be executed multiple times.
  ## Dependencies between stages are declared via the depends_on parameter (see the minimal sketch below).

  ## This config file includes options for initializing classes and options specific to the protocol

  ## Options notation:
  ## *: denotes the default option
  ## @: denotes that the file or directory will be downloaded and stored locally if not provided
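  ## A minimal illustrative sketch of how stages chain via depends_on (the stage names are
  ## placeholders; the full options for each stage type are documented below):
  # my_embed_stage:
  #   type: embed
  #   protocol: seqvec
  #   reduce: True
  # my_project_stage:
  #   type: project
  #   protocol: tsne
  #   depends_on: my_embed_stage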

stage_0:
  type: align
  # Required: which protocol to use.
  # Options: mmseqs_search, deepblast
  protocol: mmseqs_search

  # Required for mmseqs_search: sequences to search against (can be a FASTA file, an mmseqs db, or an mmseqs profile db); supply one of the following three parameters:
  search_sequences_file: /path/to/a/sequence/db.fasta
  # search_sequences_directory: /path/to/a/sequence/db/
  # search_profiles_directory: /path/to/a/profiles/db/

  # Optional for mmseqs_search: convert alignment to profile
  # convert_to_profiles: [True,*False]

  # Optional for mmseqs_search: alignment parameters (check bio_embeddings/align/mmseqs2.py#13 for available options)
  # mmseqs_search_options:
  #   num_iterations: 5
  #   sensitivity: 7.5
  #   minimum_sequence_identity: 0.2
  #   maximum_number_of_prefilter_sequences: 100
  #   alignment_output: True
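  ## An illustrative sketch (stage name and paths are placeholders): search against a pre-built
  ## mmseqs profile database and keep the alignment output.
  # stage_profile_search:
  #   type: align
  #   protocol: mmseqs_search
  #   search_profiles_directory: /path/to/a/profiles/db/
  #   mmseqs_search_options:
  #     sensitivity: 7.5
  #     alignment_output: True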

stage_1:
  type: embed
  # Required: which embedder to use
  # Options: seqvec, prottrans_albert_bfd, prottrans_bert_bfd, prottrans_t5_bfd, prottrans_t5_uniref50,
  # prottrans_t5_xl_u50, prottrans_xlnet_uniref100, cpcprot, esm, esm1b, esm1v, plus_rnn, unirep, bepler
  protocol: seqvec

  # Optional: reduce embeddings to fixed size, per-protein. Comment out if not needed.
  # Note that you can always compute the reduced embeddings from the full embeddings
  # but some further stages (e.g. unsupervised extract) need this option
  reduce: True

  # Optional: discard per amino acid embeddings.
  # Setting this parameter to True will disable storing full size embeddings (per amino acid).
  # This parameter only works in combination with `reduce: True` or `embeddings_transformer_function`.
  # discard_per_amino_acid_embeddings: [True, *False]

  # Optional/Advanced: apply a transformation on the per-amino-acid embeddings
  # === This is an advanced parameter ===
  # This parameter will be "eval"-uated. It must be a callable. Most likely, you'll want to define lambda functions.
  # The input of the function is a per-amino-acid embedding (i.e. a numpy array, e.g. of shape n_layers x sequence_length x embedding_dimension for SeqVec).
  # You can use numpy functions via "np".
  # The result of the transformation will be stored in the transformed_embeddings_file.
  # The pipeline *WILL NOT* check file size of the prospective transformed_embeddings_file.
  # This parameter can be used in conjunction with `discard_per_amino_acid_embeddings`.
  # Two examples:
  #   - "lambda x: x[0].mean(0)" --> for SeqVec, this will return the mean pooled embedding of the first layer
  #   - "lambda x: x.max(0)" --> for ProtTrans-BERT-BFD, this will return the max pooled embedding, instead of mean pooled
  # embeddings_transformer_function:
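  ## An illustrative sketch (assuming SeqVec as the embedder): quote the lambda so YAML reads it
  ## as a string; this stores the mean-pooled embedding of the first layer for each protein.
  # embeddings_transformer_function: "lambda x: x[0].mean(0)"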

  ## Mandatory for esm1v: This defines which of the five models of the ensemble will be used
  # ensemble_id: [1,2,3,4,5]
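  ## An illustrative sketch (assuming esm1v as the embedder): select the third model of the ensemble.
  # protocol: esm1v
  # ensemble_id: 3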

  #### Optional parameters to instantiate classes

  ### Optional for protocol: seqvec
  # weights_file: @/path/to/file
  # options_file: @/path/to/file
  ## The following parameter sets an upper bound on total AA to include when embedding many sequences.
  ## Adjust this parameter if CUDA runs out of memory! The default (15000) works for an NVIDIA GTX 1080 with 8GB of memory.
  # max_amino_acids: [*15000]
  ## The following parameter sets the amount of AA to include in a batch before writing to disk.

  ### Optional for protocol: fasttext, word2vec, glove, esm
  # model_file: @/path/to/file

  ### Optional for protocol: albert, bert, xlnet
  # model_directory: @/path/to/directory

  ### Optional for protocol: seqvec, prottrans_albert_bfd, prottrans_bert_bfd, prottrans_xlnet_uniref100, esm
  ## Set the following parameter to calculate embeddings on a specific device or a specific GPU on multi-GPU hosts.
  ## The default, "cuda", runs on the default GPU. You can switch to CPU with "cpu" or select a GPU on
  ## multi-GPU systems with "cuda:0" or "cuda:1" etc.
  ## See https://pytorch.org/docs/stable/tensor_attributes.html?highlight=device#torch.torch.device
  ## for a complete list
  ## unirep will use the CPU by default; follow https://github.com/google/jax#pip-installation
  ## to run it on the GPU.
  # device: cuda
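  ## An illustrative sketch: pin the embedder to the second GPU of a multi-GPU host.
  # device: "cuda:1"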

  ## Save numbers with lower precision, so that they take only about half the
  ## storage space (replace float32 with float16).
  ## This makes predictions based on the embeddings less exact.
  # half_precision: [true, *false]

  ## For prottrans_t5_bfd, prottrans_t5_uniref50 and prottrans_t5_xl_u50 only:
  ## Use the model in half precision mode (float16).
  ## We recommend activating this with T5: on the tested GPU (Quadro RTX 3000) it reduces memory
  ## consumption from 12GB to 7GB, while the effect on benchmarks is negligible (±0.1 percentage
  ## points on different sets, generally below the standard error).
  # half_precision_model: [true, *false]
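  ## An illustrative sketch (assuming prottrans_t5_xl_u50 as the embedder): run the model in
  ## float16 and also store the resulting embeddings in float16.
  # protocol: prottrans_t5_xl_u50
  # half_precision_model: True
  # half_precision: True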

stage_2:
  type: project
  # Required: which projection algorithm to use
  # Options: tsne, umap, pb_tucker
  protocol: tsne
  # Either depend on an embedding stage with reduced embeddings
  depends_on: stage_1
  # or define a mapping file and a reduced embeddings file:
  # reduced_embeddings_file: path/to/reduced_embeddings_file.h5
  # mapping_file: path/to/mapping_file.csv


  ### Optional for protocol: tsne
  # n_iter: *15000
  # perplexity: *6
  # n_jobs: *-1

  ### Optional for protocol: umap
  # min_dist: *0.6
  # spread: *1
  # n_neighbors: *15

  ### Optional for protocol: tsne and umap
  # metric: *'cosine'
  # n_components: *3
  # random_state: *420
  # verbose: *1

  ## Optional with pb_tucker: Postprocess reduced embeddings with tucker, which is a contrastive
  ## learning model trained to distinguish CATH superfamilies. It reduces the dimensionality from 1024 to 128.
  # model_file: @/path/to/file
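  ## An illustrative sketch of a pb_tucker stage (the stage name is a placeholder; it assumes the
  ## embed stage it depends on produced 1024-dimensional reduced embeddings, e.g. from prottrans_bert_bfd):
  # stage_tucker:
  #   type: project
  #   protocol: pb_tucker
  #   depends_on: stage_1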

stage_3:
  type: mutagenesis
  ## Required: Which language model to use
  ## Options: protbert_bfd_mutagenesis
  protocol: protbert_bfd_mutagenesis
  ## Optional: temperature for softmax, see https://arxiv.org/abs/1503.02531
  # temperature: [*1]

  ## Optional: Since we're running ProtBert, the common ProtBert options explained in `embed` are supported
  # model_directory: @/path/to/directory
  # device: cuda
  # half_precision: [true, *false]
  # half_precision_model: [true, *false]
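  ## An illustrative sketch (the stage name is a placeholder): the residue probabilities produced
  ## here can be rendered by a later visualize stage using the plot_mutagenesis protocol.
  # stage_plot_mutagenesis:
  #   type: visualize
  #   protocol: plot_mutagenesis
  #   depends_on: stage_3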

stage_4:
  type: visualize
  ## Required: which graph to render
  ## Options: plotly, plot_mutagenesis
  protocol: plotly
  ## For plotly: either depend on a project stage with a projected embeddings file
  depends_on: stage_2
  ## or define projected_reduced_embeddings_file:
  # projected_reduced_embeddings_file: path/to/projected_reduced_embeddings_file.h5
  ## For plot_mutagenesis: Either depend on a mutagenesis stage
  # depends_on: stage_3
  ## or define residue_probabilities_file:
  # residue_probabilities_file: path/to/residue_probabilities_file.csv

  ## Optional for plotly: csv file with annotations
  ## csv must have header: identifier, label
  # annotation_file: path/to/annotation_file.csv
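  ## A minimal illustrative annotation file (identifiers and labels are hypothetical):
  ##   identifier,label
  ##   SEQWENCE_1,membrane
  ##   SEQWENCE_2,cytosol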

  ## Optional for plotly: whether to display proteins for which there is no annotation in the annotation file; set to False to hide them (only relevant if an annotation file is provided)
  # display_unknown: [False, *True]

  ## Optional for plotly: set to True if the identifiers in the annotation_file correspond to sequence MD5 hashes.
  ## If set to False (the default), mapping will be performed on the original identifiers;
  ## identifiers that are missing or duplicated will be ignored.
  # merge_via_index: [True, *False]

  ## Optional for plotly: 2D vs 3D plot
  # n_components: [2,*3]

stage_5:
  type: extract
  ## Required: which method to use.
  ## Current options:
  ##   - seqvec_from_publication (Uses models evaluated in https://doi.org/10.1186/s12859-019-3220-8 )
  ##   - bert_from_publication (Uses models evaluated in https://doi.org/10.1101/2020.07.12.199554 )
  ##   - unsupervised (Uses concepts presented in https://github.com/Rostlab/goPredSim )
  protocol: seqvec_from_publication

  ## The supervised extract (bert_from_publication and seqvec_from_publication) needs
  ## the full embeddings, i.e. `discard_per_amino_acid_embeddings` must be false in the
  ## embed stage, which is the default. For the unsupervised extract, you need the
  ## reduced embeddings, i.e. set `reduce: True`.
  ## Instead of depending on an embed stage, you can also provide a file manually: for unsupervised, supply
  # reduced_embeddings_file:
  ## or, for bert_from_publication and seqvec_from_publication:
  # embeddings_file:
  depends_on: stage_2

  ## Optional for protocol: seqvec_from_publication, bert_from_publication,
  ## will be downloaded if not supplied
  # secondary_structure_checkpoint_file: path/to/checkpoint.pt
  # subcellular_location_checkpoint_file: path/to/checkpoint.pt

  ## Required for protocol: unsupervised
  # reference_embeddings_file: path/to/embeddings.hd5
  # reference_annotations_file: path/to/annotation_file.csv

  ## Optional for protocol: unsupervised
  ## The following two options correspond to parameters of the pairwise_distances function of scikit-learn 0.23.2
  ## https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html#sklearn.metrics.pairwise_distances
  # n_jobs: [*1]
  # metric: [*euclidean]
  ## The following will define how many neighbours to consider to transfer annotations.
  ## k = 1 means transfer the annotations from the nearest neighbor
  ## k > 1 will result in merging the annotations of all k neighbours onto the target embedding
  # k_nearest_neighbours: [*1]
  ## The following informs the pipeline whether you want to keep the pairwise distance matrix file.
  ## This is a CSV containing pairwise distances between query and reference embeddings using your metric.
  ## The file can become quite big (e.g. UniProt Human (query) vs. SwissProt (reference) results in 45GB)
  ## By default, this file will be discarded, but you can decide to keep it to perform other calculations.
  # keep_pairwise_distances_matrix_file: [*False]
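  ## An illustrative sketch of an unsupervised extract stage (stage name and paths are placeholders;
  ## it assumes the embed stage it depends on was run with `reduce: True`):
  # stage_transfer:
  #   type: extract
  #   protocol: unsupervised
  #   depends_on: stage_1
  #   reference_embeddings_file: path/to/reference_embeddings.h5
  #   reference_annotations_file: path/to/reference_annotations.csv
  #   k_nearest_neighbours: 3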