Day 3 - Array jobs and pipelines
Array jobs
qsub -cwd -o out -e err -t 1-10 script.sh
- if out/err is an existing directory, the output files are placed there
- the SGE_TASK_ID environment variable holds the index of the current array task
- pipelines
- make
A simple pipeline
extract_species.py
find_cats.sh
count_cats.sh
get_sequence_length.sh
#!/bin/bash
#
# process_species.sh
#
# Darren Kessner
# HPC Training Workshop
# Cedars-Sinai Medical Center
# April 2022
#
# Runs the four pipeline steps for one array-job task. The task id is
# read from SGE_TASK_ID and used as the stem of every output file:
#   <id>.fa        output of extract_species.py
#   <id>.catlist   output of find_cats.sh
#   <id>.catcount  output of count_cats.sh
#   <id>.len       output of get_sequence_length.sh
#
# Usage: SGE_TASK_ID=n process_species.sh
#

taskid=${SGE_TASK_ID:-}

# Grid Engine sets SGE_TASK_ID to the literal string "undefined" when the
# job was not submitted as an array job; treat that like a missing id so
# we never create files named undefined.*
if [[ -z "$taskid" || "$taskid" == "undefined" ]]; then
  echo "Usage: SGE_TASK_ID=n process_species.sh" >&2
  exit 1
fi

echo "taskid: $taskid"

filename_fa="$taskid.fa"
filename_catlist="$taskid.catlist"
filename_catcount="$taskid.catcount"
filename_len="$taskid.len"

# extract single species
echo "extract_species -> $filename_fa"
./extract_species.py bacteria.fasta "$taskid" > "$filename_fa"

# find CATs
echo "find_cats-> $filename_catlist"
./find_cats.sh "$filename_fa" > "$filename_catlist"

# count CATs
echo "count_cats-> $filename_catcount"
./count_cats.sh "$filename_catlist" > "$filename_catcount"

# sequence length
echo "get_sequence_length.sh -> $filename_len"
./get_sequence_length.sh "$filename_fa" > "$filename_len"
Example of converting task id to filename
#!/bin/bash
#
# process_file.sh
#
# Example of mapping an array-job task id to a filename: task n handles
# the filename found on line n of filenames.txt (in the current directory).
#
# Usage: SGE_TASK_ID=n process_file.sh
#

taskid=${SGE_TASK_ID:-}

# The id is used as a line number, so it must be a positive integer.
# This also rejects an empty or non-numeric value.
if [[ ! "$taskid" =~ ^[1-9][0-9]*$ ]]; then
  echo "Usage: SGE_TASK_ID=n process_file.sh" >&2
  exit 1
fi

if [[ ! -r filenames.txt ]]; then
  echo "process_file.sh: cannot read filenames.txt" >&2
  exit 1
fi

# calculate filename based on task id
# Print exactly line $taskid and stop reading. Unlike `head | tail -1`,
# this yields nothing (rather than the last line) when the file has
# fewer than $taskid lines.
filename=$(sed -n "${taskid}{p;q;}" filenames.txt)

if [[ -z "$filename" ]]; then
  echo "process_file.sh: no filename on line $taskid of filenames.txt" >&2
  exit 1
fi

echo "Processing $taskid"
echo "filename: $filename"
Using a Makefile to define the pipeline
#
# Makefile
#
# Darren Kessner
# HPC Training Workshop
# Cedars-Sinai Medical Center
# April 2022
#
# Defines the per-task pipeline as pattern rules. Requesting `make N.all`
# builds N.catcount and N.len, which in turn need N.catlist and N.fa.
# Make only runs the steps whose target files are missing or out of date.
#
# Automatic variables used in the recipes:
# $* stem of implicit rule match
# $@ target
# $< first prerequisite

# clean names an action, not a file; without this declaration a file
# called "clean" in the directory would stop the rule from running.
.PHONY: clean

# Entry point for one task; no N.all file is ever created, so the echo
# runs every time the target is requested.
%.all: %.catcount %.len
	echo Processing task $*

# The stem is passed to extract_species.py as its second argument.
%.fa:
	./extract_species.py bacteria.fasta $* > $@

%.catlist: %.fa
	./find_cats.sh $< > $@

%.catcount: %.catlist
	./count_cats.sh $< > $@

%.len: %.fa
	./get_sequence_length.sh $< > $@

clean:
	rm -rf *.fa *.catcount *.catlist *.len

# don't delete intermediate files
.PRECIOUS: %.fa %.catlist %.catcount %.len
#!/bin/bash
#
# process_species_2.sh
#
# Darren Kessner
# HPC Training Workshop
# Cedars-Sinai Medical Center
# April 2022
#
# Runs the pipeline for one array-job task by asking make to build
# <id>.all, where <id> is read from SGE_TASK_ID. The script's exit
# status is the exit status of make.
#
# Usage: SGE_TASK_ID=n process_species_2.sh
#

taskid=${SGE_TASK_ID:-}

# Grid Engine sets SGE_TASK_ID to the literal string "undefined" when the
# job was not submitted as an array job; treat that like a missing id.
if [[ -z "$taskid" || "$taskid" == "undefined" ]]; then
  echo "Usage: SGE_TASK_ID=n process_species_2.sh" >&2
  exit 1
fi

echo "taskid: $taskid"

make "$taskid.all"