|
| 1 | +#!/usr/bin/env bash |
| 2 | + |
| 3 | +############################################################################################################################################################## |
| 4 | +# This script is meant to run COMEBin after the BAM files have been obtained. |
| 5 | +# Author of pipeline: Ziye Wang. |
| 6 | +# For questions, bugs, and suggestions, contact me at zwang17@fudan.edu.cn |
| 7 | +############################################################################################################################################################## |
VERSION="1.0.2"

#######################################
# Print the usage text for run_comebin.sh.
# Globals:   VERSION (read)
# Arguments: none
# Outputs:   usage text on stdout, framed by a blank line before and after
#######################################
help_message () {
  # One printf call: each argument becomes exactly one output line.
  printf '%s\n' \
    "" \
    "COMEBin version: $VERSION" \
    "Usage: bash run_comebin.sh [options] -a contig_file -o output_dir -p bam_file_path" \
    "Options:" \
    "" \
    " -a STR metagenomic assembly file" \
    " -o STR output directory" \
    " -p STR path to access to the bam files" \
    " -n INT number of views for contrastive multiple-view learning (default=6)" \
    " -t INT number of threads (default=5)" \
    " -l FLOAT temperature in loss function (default=0.07 for assemblies with an N50 > 10000, default=0.15 for others)" \
    " -e INT embedding size for comebin network (default=2048)" \
    " -c INT embedding size for coverage network (default=2048)" \
    " -b INT batch size for training process (default=1024)" \
    ""
}
| 26 | + |
# Locate the directory that holds run_comebin.sh by resolving it on PATH.
# The lookup is tested directly: checking $? after a nested
# "dirname $(which ...)" only reports the status of dirname, and the unquoted
# substitution breaks on install paths that contain spaces.
if ! run_script_path=$(command -v run_comebin.sh); then
  echo "cannot find run_comebin.sh file - something went wrong with the installation!"
  exit 1
fi
run_file_path=$(dirname -- "${run_script_path}")
| 33 | + |
| 34 | + |
| 35 | +######################################################################################################## |
| 36 | +######################## LOADING IN THE PARAMETERS AND RUNNING ######################## |
| 37 | +######################################################################################################## |
| 38 | + |
# Defaults for the optional parameters. temperature has no default here: it
# stays unset unless -l is supplied on the command line.
num_threads=5
n_views=6
emb_szs_forcov=2048
emb_szs=2048
batch_size=1024

#######################################
# Parse the command-line options into global variables.
# Globals:   contig_file, output_dir, bam_file_path, n_views, num_threads,
#            temperature, emb_szs, emb_szs_forcov, batch_size (written)
# Arguments: the script's positional parameters ("$@")
# Returns:   0 on success; exits the script with status 1 when getopts
#            reports an unknown option or a missing option argument
#######################################
parse_args() {
  # OPTIND is made local so that repeated calls restart from the first argument.
  local OPTIND=1 OPT
  while getopts a:o:p:n:t:l:e:c:b: OPT; do
    case "${OPT}" in
      # The three path options are canonicalised with realpath; the argument
      # is quoted so that paths containing spaces survive word splitting.
      a) contig_file=$(realpath "${OPTARG}") ;;
      o) output_dir=$(realpath "${OPTARG}") ;;
      p) bam_file_path=$(realpath "${OPTARG}") ;;
      n) n_views=${OPTARG} ;;
      t) num_threads=${OPTARG} ;;
      l) temperature=${OPTARG} ;;
      e) emb_szs=${OPTARG} ;;
      c) emb_szs_forcov=${OPTARG} ;;
      b) batch_size=${OPTARG} ;;
      \?)
        # getopts has already written its own diagnostic to stderr.
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
| 72 | + |
# check parameter
# The three path options are mandatory; print the usage text and stop when any
# of them is missing. This runs before the directory change so that a usage
# error is reported regardless of the installation layout.
if [[ -z "${contig_file}" || -z "${output_dir}" || -z "${bam_file_path}" ]]; then
  help_message
  exit 1
fi

# The python commands below invoke main.py by relative path, so the working
# directory must be the COMEBin directory next to this script. Abort when the
# directory cannot be entered rather than running from the wrong place.
cd "${run_file_path}/COMEBin" || {
  echo "cannot enter ${run_file_path}/COMEBin - something went wrong with the installation!" >&2
  exit 1
}
| 80 | + |
| 81 | + |
#######################################
# Compute the N50 of a FASTA file.
# Arguments: $1 - FASTA file to read
#            $2 - file that receives the sequence lengths, one per line,
#                 sorted in descending numeric order
# Outputs:   the N50 on stdout
# Returns:   non-zero (printing nothing) when the FASTA file is not readable
#######################################
compute_n50() {
  local fasta=$1
  local lengths_file=$2
  local total_length

  # Without this guard an empty path would make awk fall back to reading stdin.
  [[ -r "${fasta}" ]] || return 1

  # Compute the length of each sequence and sort using the awk command
  awk '/^>/ {if (seqlen) print seqlen; seqlen=0; next} {seqlen+=length($0)} END {print seqlen}' "${fasta}" \
    | sort -rn > "${lengths_file}"

  # N50 = length of the sequence at which the running sum of the sorted
  # lengths first reaches half of the total length.
  total_length=$(awk '{sum+=$1} END {print sum}' "${lengths_file}")
  awk -v total="${total_length}" 'BEGIN {cutoff=total/2; current=0} {current+=$1; if (current >= cutoff) {print $1; exit}}' "${lengths_file}"
}

# When no temperature was given with -l, derive it from the assembly's N50:
# 0.07 when N50 > 10000, otherwise 0.15.
if [[ -z "$temperature" ]]; then
  if ! target_length=$(compute_n50 "${contig_file}" "${contig_file}_lengths.txt"); then
    echo "warning: cannot read contig file '${contig_file}'; N50 is unavailable" >&2
  fi

  # N50
  echo "N50: $target_length"
  # An empty or non-numeric N50 falls through to the 0.15 default instead of
  # triggering an "integer expression expected" error from the comparison.
  if [[ "$target_length" =~ ^[0-9]+$ ]] && [ "$target_length" -gt 10000 ]; then
    temperature=0.07
  else
    temperature=0.15
  fi
fi
echo "Tau(temperature): ${temperature}"
| 102 | + |
| 103 | + |
| 104 | + |
| 105 | + |
| 106 | +######################################################################################################## |
| 107 | +###### Get augmentation data |
| 108 | +######################################################################################################## |
# Data augmentation is skipped only when the output folder already exists and
# holds exactly n_views files whose name contains the keyword; in every other
# case it is (re)run.
folder="${output_dir}/data_augmentation"
keyword="_datacoverage_mean"
run_augmentation=true

if [[ -d "$folder" ]]; then
  echo "${output_dir}/data_augmentation exists."
  count=$(find "$folder" -maxdepth 1 -type f -name "*$keyword*" | wc -l)
  echo "Number of files containing '$keyword' in the folder: $count"
  if [ "$count" -eq "${n_views}" ]; then
    echo "No need to run data augmentation."
    run_augmentation=false
  fi
else
  echo "${output_dir}/data_augmentation does not exist."
fi

if [[ "$run_augmentation" == true ]]; then
  echo "Running data augmentation."
  # Every path is quoted so that directories containing spaces reach main.py
  # as single arguments. The command's status is tested directly.
  if ! python main.py generate_aug_data --contig_file "${contig_file}" \
    --out_augdata_path "${output_dir}/data_augmentation" \
    --n_views "${n_views}" --bam_file_path "${bam_file_path}" --num_threads "${num_threads}"; then
    echo "Something went wrong with running generating augmentation data. Exiting."
    exit 1
  fi
fi
| 133 | + |
| 134 | +######################################################################################################## |
| 135 | +###### Get representation (training process) |
| 136 | +######################################################################################################## |
# Training is skipped only when the result folder already exists and holds
# exactly two files whose name contains "embeddings.tsv"; in every other case
# it is (re)run.
folder="${output_dir}/comebin_res"
keyword="embeddings.tsv"
run_training=true

if [[ -d "$folder" ]]; then
  echo "${output_dir}/comebin_res exists."
  count=$(find "$folder" -maxdepth 1 -type f -name "*$keyword*" | wc -l)
  echo "Number of files containing '$keyword' in the folder: $count"
  if [ "$count" -eq 2 ]; then
    echo "No need to run getting representation."
    run_training=false
  fi
else
  echo "${output_dir}/comebin_res does not exist."
fi

if [[ "$run_training" == true ]]; then
  echo "Running getting representation."
  # Every value is quoted so that paths containing spaces reach main.py as
  # single arguments. The command's status is tested directly.
  if ! python main.py train --data "${output_dir}/data_augmentation" \
    --temperature "${temperature}" --emb_szs_forcov "${emb_szs_forcov}" \
    --batch_size "${batch_size}" --emb_szs "${emb_szs}" --n_views "${n_views}" \
    --add_model_for_coverage \
    --output_path "${output_dir}/comebin_res" --earlystop --addvars --vars_sqrt; then
    echo "Something went wrong with running training network. Exiting."
    exit 1
  fi
fi
| 166 | + |
| 167 | +######################################################################################################## |
| 168 | +#### Clustering (run Leiden-based clustering methods and get the final result) |
| 169 | +######################################################################################################## |
# Clustering runs in two steps: "bin" followed by "get_result". Each step's
# exit status is checked on its own, so a failed "bin" run stops the script
# instead of letting "get_result" run anyway.
emb_file="${output_dir}/comebin_res/embeddings.tsv"
seed_file="${contig_file}.bacar_marker.2quarter_lencutoff_1001.seed"

if ! python main.py bin --contig_file "${contig_file}" \
  --emb_file "${emb_file}" \
  --output_path "${output_dir}/comebin_res" \
  --seed_file "${seed_file}" --num_threads "${num_threads}"; then
  echo "Something went wrong with running clustering. Exiting."
  exit 1
fi

if ! python main.py get_result --contig_file "${contig_file}" \
  --output_path "${output_dir}/comebin_res" \
  --seed_file "${seed_file}" --num_threads "${num_threads}"; then
  echo "Something went wrong with running clustering. Exiting."
  exit 1
fi
| 183 | + |
0 commit comments