Array jobs

  • qsub -cwd -o out -e err -t 1-10 script.sh
    • if out and err are existing directories, the job's output and error files are placed in them
  • SGE_TASK_ID environment variable
  • pipelines
  • make

A simple pipeline

extract_species.py
find_cats.sh
count_cats.sh
get_sequence_length.sh

process_species.sh

#!/bin/bash
#
# process_species.sh
#
# Darren Kessner
# HPC Training Workshop
# Cedars-Sinai Medical Center
# April 2022
#
# Runs the pipeline for one array-job task.  The task id selects the species
# and names every output file:
#
#   extract_species.py    -> <taskid>.fa
#   find_cats.sh          -> <taskid>.catlist
#   count_cats.sh         -> <taskid>.catcount
#   get_sequence_length.sh -> <taskid>.len
#
# Usage: SGE_TASK_ID=n process_species.sh
#

# Stop at the first failing step: every later step reads the output of an
# earlier one, so continuing would only process empty or partial files.
set -euo pipefail

taskid=${SGE_TASK_ID:-}

# Grid Engine sets SGE_TASK_ID to the literal string "undefined" when the job
# was submitted without -t, so treat that the same as unset.
if [[ -z "$taskid" || "$taskid" == "undefined" ]]
then
    echo "Usage: SGE_TASK_ID=n process_species.sh" >&2
    exit 1
fi

echo "taskid: $taskid"

filename_fa="${taskid}.fa"
filename_catlist="${taskid}.catlist"
filename_catcount="${taskid}.catcount"
filename_len="${taskid}.len"

# extract single species

echo "extract_species -> $filename_fa"
./extract_species.py bacteria.fasta "$taskid" > "$filename_fa"

# find CATs

echo "find_cats-> $filename_catlist"
./find_cats.sh "$filename_fa" > "$filename_catlist"

# count CATs

echo "count_cats-> $filename_catcount"
./count_cats.sh "$filename_catlist" > "$filename_catcount"

# sequence length

echo "get_sequence_length.sh -> $filename_len"
./get_sequence_length.sh "$filename_fa" > "$filename_len"


Example of converting task id to filename

process_file.sh

#!/bin/bash
#
# process_file.sh
#
# Example of converting an array-job task id to a filename: task n handles
# the file named on line n of filenames.txt (read from the current directory).
#
# Usage: SGE_TASK_ID=n process_file.sh
#

set -euo pipefail

taskid=${SGE_TASK_ID:-}

# Grid Engine sets SGE_TASK_ID to the literal string "undefined" when the job
# was submitted without -t, so treat that the same as unset.
if [[ -z "$taskid" || "$taskid" == "undefined" ]]
then
    echo "Usage: SGE_TASK_ID=n process_file.sh" >&2
    exit 1
fi

# The task id is used as a line number, so it must be a positive integer.
if [[ ! "$taskid" =~ ^[1-9][0-9]*$ ]]
then
    echo "process_file.sh: SGE_TASK_ID must be a positive integer, got '$taskid'" >&2
    exit 1
fi

# calculate filename based on task id: print line $taskid, then stop reading.
# Unlike "head -n | tail -1", this yields nothing (rather than the last line)
# when the task id is larger than the number of lines in filenames.txt.
filename=$(sed -n "${taskid}{p;q;}" filenames.txt)

if [[ -z "$filename" ]]
then
    echo "process_file.sh: no filename on line $taskid of filenames.txt" >&2
    exit 1
fi

echo "Processing $taskid"
echo "filename: $filename"


Using a Makefile to define the pipeline

Makefile

# 
# Makefile
#
# Darren Kessner
# HPC Training Workshop
# Cedars-Sinai Medical Center
# April 2022
#


# Usage: make N.all      (N is the species / task id, e.g. "make 3.all")
#        make clean
#
# Automatic variables used in the recipes below:
# $* stem of implicit rule match
# $@ target
# $< first prerequisite


# clean names a command, not a file; without this a file called "clean"
# in the directory would stop the rule from ever running.
.PHONY: clean

%.all: %.catcount %.len
	echo Processing task $*

# Every recipe below redirects into its target, so the shell creates the
# target before the tool runs.  If the tool fails, remove the partial file;
# otherwise the next make would consider it up to date and build on it.

%.fa:
	./extract_species.py bacteria.fasta $* > $@ || { rm -f $@; exit 1; }

%.catlist: %.fa
	./find_cats.sh $< > $@ || { rm -f $@; exit 1; }

%.catcount: %.catlist
	./count_cats.sh $< > $@ || { rm -f $@; exit 1; }

%.len: %.fa
	./get_sequence_length.sh $< > $@ || { rm -f $@; exit 1; }

# outputs are plain files, so no recursive delete is needed
clean:
	rm -f *.fa *.catcount *.catlist *.len

# don't delete intermediate files
.PRECIOUS: %.fa %.catlist %.catcount %.len


process_species_2.sh

#!/bin/bash
#
# process_species_2.sh
#
# Darren Kessner
# HPC Training Workshop
# Cedars-Sinai Medical Center
# April 2022
#
# Runs the pipeline for one array-job task by asking make to build
# <taskid>.all; the Makefile in the current directory defines the steps.
#
# Usage: SGE_TASK_ID=n process_species_2.sh
#

# Exit with make's status if the build fails, so the scheduler sees the error.
set -euo pipefail

taskid=${SGE_TASK_ID:-}

# Grid Engine sets SGE_TASK_ID to the literal string "undefined" when the job
# was submitted without -t, so treat that the same as unset.
if [[ -z "$taskid" || "$taskid" == "undefined" ]]
then
    echo "Usage: SGE_TASK_ID=n process_species_2.sh" >&2
    exit 1
fi

echo "taskid: $taskid"

make "${taskid}.all"