Pipeline & Module YAML Creation
Modified on Thu, 26 Sep, 2024 at 9:29 AM
- Demo Module YAML: configure it based on your NF pipeline
name: testname # Module name
type: testtype # Module type
version: v1.0.0
path: /home/ec2-user/bioinfo/infra/repository
command:
## Install Java and Nextflow
# This section installs the required versions of Java and Nextflow
- /home/ec2-user/bioinfo/infra/storage/sync_reflib.py -k reflib/tool/install-java-on-worker.sh -d /home/ec2-user/tools/ -f
- sudo chmod +x /home/ec2-user/tools/install-java-on-worker.sh && sudo sh /home/ec2-user/tools/install-java-on-worker.sh
- export NXF_VER=23.10.0 # Choose nextflow version to use
- wget -qO- https://get.nextflow.io | bash
- sudo mv nextflow /usr/bin/
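## Verify installation (optional)
# Illustrative sanity check (an addition, not part of the original template): confirms the pinned Nextflow version is on the PATH
- nextflow -version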
## Set Up AWS Variables
# This section configures the scripts to use your AWS credentials
- REGION=$(jq '.repository.settings.region' /home/ec2-user/.bpconfig.json | sed 's/"//g')
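# e.g. (illustrative) if /home/ec2-user/.bpconfig.json contains {"repository": {"settings": {"region": "us-east-1"}}}, REGION resolves to us-east-1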
## For Private Git Repo
- python3.8 {{path}}/github_integration.py --url https://github.com/gitaccount/reponame.git --clonedir {{basedir}}/reponame/ --branch {{nfcode_version}} && cd {{basedir}}/reponame/
# optional params --branch [release tag / branch name]
## For Public Git Repo (Note: comment out the private Git repo command above)
# git clone -b {{nfcode_version}} --single-branch git@github.com:gitaccount/reponame.git {{basedir}}/reponame/ && cd {{basedir}}/reponame/
## SampleSheet Creation
# The script below creates a sample sheet CSV input file. Pass the column header names your NF script expects via the arguments below:
# 1. --samplename { column name for samples in the sample sheet }
# 2. --forcolname { column name for the forward FASTQ in the sample sheet }
# 3. --revcolname { column name for the reverse FASTQ in the sample sheet }
# 4. --extra_columnN value --extra_dataN value { column name and data value for an extra column, where N is an integer, e.g. --extra_column1 }
- |
  /home/ec2-user/bioinfo/bioinfo/nf_samplesheet.py --sampleids {{all_sample_ids}} --samplename 'sample' --forcolname '' --revcolname '' --extra_column1 '' --extra_data1 '' --outdir {{basedir}}/
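# Example output samplesheet.csv (illustrative; actual columns depend on the argument values above):
# sample,fastq_1,fastq_2
# sample1,sample1_R1.fastq.gz,sample1_R2.fastq.gz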
## Run Nextflow CMD
# Add the required params to the nextflow run command with default values, and also add them to the inputs section so users can change the values on the analysis page.
- |
  nextflow run main.nf -profile docker --input {{basedir}}/samplesheet.csv --outdir {{basedir}}/results/ --max_cpus {{max_cpus}}
# NOTE: to run on AWS Batch, use -profile awsbatch,docker and add --awsregion $REGION --awsqueue job-queue-prod
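# Example AWS Batch invocation (illustrative; combines the command and note above, and job-queue-prod is a placeholder for your own Batch job queue):
# nextflow run main.nf -profile awsbatch,docker --input {{basedir}}/samplesheet.csv --outdir {{basedir}}/results/ --max_cpus {{max_cpus}} --awsregion $REGION --awsqueue job-queue-prod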
# This section defines the inputs/params (str, int, float, or option types) the user can provide to the command section
inputs:
  nfcode_version:
    val: v1.0.1
    type: option
    show: true # set to false if you don't want to show this on the analysis page
    label: nf code pipeline version
    options:
      - v1.0.0
      - v1.0.1
      - v1.0.2
      - v1.0.3
    help: "select the version of the nf pipeline to run"
  max_cpus:
    val: 24
    type: int
    min: 8
    max: 36
    label: Max CPU
    show: false
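# Example of an additional str-type input (illustrative; not part of the demo above):
#  outprefix:
#    val: 'run1'
#    type: str
#    label: Output prefix
#    show: true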
## This section defines the outputs (i.e. file, zip, folder) used to save results and display them on the analysis page
# Add or remove output fields based on pipeline requirements.
outputs:
  input_csv:
    type: file
    val: ''
    action: template
    formula: _{{basedir}}/samplesheet.csv
  summary_html:
    type: file
    val: ''
    action: template
    formula: _{{basedir}}/results/summary_report_final.html
    tags: [report, html]
  zip_outdir:
    type: file
    val: 'Output.zip'
    action: template
    dir_action: template
    dir_formula: _{{basedir}}/results/
    dir_val: _{{basedir}}/results/
    formula: _{{basedir}}/Output.zip
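# Example of a folder-type output (illustrative; the field set mirrors the file outputs above):
#  results_dir:
#    type: folder
#    val: ''
#    action: template
#    formula: _{{basedir}}/results/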
Push Module YAML to the Basepair database
# The module create command creates a new module in the BP database and returns an id, which must be added to the top of the module YAML, e.g. id: 12345.
basepair module create --file ~/pathtomodule/modulename.yaml
# Use the module update command to push module-level changes to the BP database.
basepair module update --file ~/pathtomodule/modulename.yaml
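For example, if the create command returned id 12345 (a placeholder), the top of the module YAML becomes:
id: 12345
name: testname # Module name
type: testtype # Module type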
- Demo Pipeline YAML: configure it based on your NF pipeline requirements
name: 'pipelinename' # name of the pipeline
summary: |
  Summary about pipeline
description: |
  Description about pipeline
datatype: dna-seq # {atac-seq,chip-seq,crispr,cutnrun,cutntag,dna-seq,other,panel,rna-seq,scrna-seq,small-rna-seq,snap-chip,wes,wgs}
visibility: private # choose between public/private
# Choose an instance from the list below based on the max memory and CPU defined in the nextflow config for the docker profile.
# Instance types: "c1.medium","c3.2xlarge","c3.4xlarge","c3.8xlarge","c3.large","c3.xlarge","c4.8xlarge","c5d.18xlarge","c5d.24xlarge","c5d.2xlarge","c5d.4xlarge","c5d.9xlarge","c5d.large","c5d.xlarge","c6gd.large","i3.16xlarge","i3.2xlarge","i3.4xlarge","i3.8xlarge","i3en.xlarge","m1.large","m1.medium","m1.small","m1.xlarge","m2.2xlarge","m2.4xlarge","m3.2xlarge","m3.large","m3.medium","m3.xlarge","m5d.12xlarge","m5d.2xlarge","m5d.4xlarge","m5d.8xlarge","m5d.large","m5d.xlarge","m6gd.medium","r3.2xlarge","r3.4xlarge","r3.large","r3.xlarge","t3.micro","t3.nano","t4g.nano","x1e.16xlarge"
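# e.g. (illustrative) a nextflow config with max_cpus = 8 and max_memory = 60.GB fits on m5d.4xlarge (16 vCPU, 64 GiB)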
infra:
  instance_type: c1.medium
  save_app_node_id: 'save'
tags: [nf, pipelinename]
validation:
  required:
    filetypes:
      - fastq
    genome: false # set to true if Basepair genome files will be used
    num_samples: '1' # Default '1' runs the analysis on one sample at a time; '1+' allows more than one sample.
    num_controls: '0' # Number of control samples
    paired: true # switch to false for single-end data
    datatype:
      - dna-seq
edges:
  - parent_node_id: '-1'
    app_node_id: 'start'
  - parent_node_id: 'start'
    app_node_id: 'modulename'
  - parent_node_id: 'modulename'
    app_node_id: 'stop'
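# The edges above define a linear graph: start -> modulename -> stop; parent_node_id '-1' marks the entry edge.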
nodes:
  'save':
    app_id: '9'
    info:
      bucket: bucket
  'start':
    app_id: '5'
    info:
      dirname: compute_basedir
  'modulename':
    app_id: 'Id used at module creation'
    info:
      num_threads: num_threads # fetched from analysis API
      memory: memory # fetched from analysis API
      bucket: bucket # fetched from analysis API
      all_sample_ids: all_sample_ids # fetched from analysis API
      storage_basedir: storage_basedir # fetched from analysis API
      basedir: compute_basedir # fetched from analysis API
      genome_name: genome_id # fetched from analysis API
      fasta: genome_id # fetched from analysis API
      genome_id: genome_id # fetched from analysis API
      slug: slug # fetched from analysis API
  'stop':
    app_id: '22'
    info:
      compute_basedir: compute_basedir
Push Pipeline YAML to the Basepair database
# The pipeline create command creates a new pipeline in the BP database and returns an id, which must be added to the top of the pipeline YAML, e.g. id: 10000.
basepair pipeline create --file ~/pathtopipeline/pipelinename.yaml
# The pipeline update command pushes pipeline-level changes to the BP database; pass the pipeline id with -u.
basepair pipeline update --file ~/pathtopipeline/pipelinename.yaml -u 10000
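For example, if the create command returned id 10000 (a placeholder), add it to the top of the pipeline YAML and pass the same id to update via -u:
id: 10000
name: 'pipelinename' # name of the pipeline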