#!/bin/bash

# Prints the usage message to standard error and terminates the script.
# Arguments: none.
# Outputs:   usage text on stderr, nothing on stdout.
# Returns:   never returns, always exits with status 1.
function print_help_and_exit
{
cat >&2 << 'EOF'

'voronota-js-pdb-utensil-gather-ensemble' script prepares an ensemble of structures in PDB format.

Options:
    --entity-id               string  *  PDB entity ID to start with
    --seq-identity            number  *  sequence identity percents, allowed values are 100, 95, 90, 70, 50 or 30
    --cache-dir               string  *  path to support directory to cache big reference files from PDB
    --output-dir              string  *  path to root output directory to store ensemble info and structures
    --size-condition          number     max allowed number of residues to deviate from maximum size in cluster, default is 80
    --log-to-file             string     file path prefix for file to redirect stdout and stderr to, default is '' to not redirect
    --no-reducing                        flag to not reduce structures to the stable residue set
    --no-superposing                     flag to not perform superposition and related actions
    --no-date                            flag to not name cached clusters file by date
    --help | -h                          flag to display help message and exit
    
Standard output:
    results summary
    
Examples:
    
    voronota-js-pdb-utensil-gather-ensemble --cache-dir ./cache --entity-id 5ITP_1 --seq-identity 90 --output-dir ./output

EOF
exit 1
}

# Remember how the script was invoked, to locate sibling executables below.
readonly ZEROARG=$0

# Without any arguments there is nothing to do: show usage and stop.
if [ -z "$1" ]
then
	print_help_and_exit
fi

# When the script was started through a path (not found via PATH lookup),
# prepend its directory to PATH so that tools located next to it are found.
# The directory is resolved in a subshell, so the working directory of the
# script is never changed, and paths containing spaces are handled.
if [[ $ZEROARG == *"/"* ]]
then
	ZEROARGDIR="$(cd "$(dirname "${ZEROARG}")" && pwd)"
	if [ -n "$ZEROARGDIR" ]
	then
		export PATH="${ZEROARGDIR}:${PATH}"
	fi
fi

# Force the C locale for all the tools called below.
export LC_ALL=C

# Option values, initialised with their defaults.
ENTITYID=""
SEQIDENTITY=""
CACHEDIR=""
OUTPUTDIR=""
SIZECONDITION="80"
LOGTOFILE=""

# Flags, all switched off by default.
NOREDUCING="false"
NOSUPERPOSING="false"
NODATE="false"
RUNMT="false"
RUNANALYSISAFTERMT="false"
HELP_MODE="false"

# Consume the command line one option at a time.
# The option name is always shifted away; options that take a value
# shift once more inside their case arm to drop the value as well.
while (( $# > 0 ))
do
	OPTION="$1"
	OPTARG="$2"
	shift
	case "$OPTION" in
	--entity-id)              ENTITYID="$OPTARG"; shift ;;
	--seq-identity)           SEQIDENTITY="$OPTARG"; shift ;;
	--cache-dir)              CACHEDIR="$OPTARG"; shift ;;
	--output-dir)             OUTPUTDIR="$OPTARG"; shift ;;
	--size-condition)         SIZECONDITION="$OPTARG"; shift ;;
	--log-to-file)            LOGTOFILE="$OPTARG"; shift ;;
	--no-reducing)            NOREDUCING="true" ;;
	--no-superposing)         NOSUPERPOSING="true" ;;
	--no-date)                NODATE="true" ;;
	--run-mt)                 RUNMT="true" ;;
	--run-analysis-after-mt)  RUNMT="true"; RUNANALYSISAFTERMT="true" ;;
	-h|--help)                HELP_MODE="true" ;;
	*)
		echo >&2 "Error: invalid command line option '$OPTION'"
		exit 1
		;;
	esac
done

# Explicit help request: print usage and stop.
if [ "$HELP_MODE" == "true" ]
then
	print_help_and_exit
fi

# Optional logging: redirect both stdout and stderr of the rest of the script
# to '<prefix>logs-ident-<identity>-sizecond-<size>/<first two chars of entity ID>/<entity ID>.txt'.
# All expansions are quoted so that prefixes with spaces or glob characters work.
if [ -n "$LOGTOFILE" ]
then
	LOGFILE="${LOGTOFILE}logs-ident-${SEQIDENTITY}-sizecond-${SIZECONDITION}/$(echo "${ENTITYID}" | sed 's/^\(..\).*$/\1/')/${ENTITYID}.txt"
	mkdir -p "$(dirname "${LOGFILE}")"
	exec > "$LOGFILE"
	exec 2>&1
fi

# Required external executables must be reachable through PATH.
if ! command -v voronota-js &> /dev/null
then
	echo >&2 "Error: 'voronota-js' executable not in binaries path"
	exit 1
fi

if ! command -v clustalo &> /dev/null
then
	echo >&2 "Error: 'clustalo' executable not in binaries path"
	exit 1
fi

# Mandatory option values must be non-empty; checked in the same order as before.
[ -n "$ENTITYID" ] || { echo >&2 "Error: no entity ID provided"; exit 1; }

[ -n "$SEQIDENTITY" ] || { echo >&2 "Error: no sequence identity provided"; exit 1; }

[ -n "$CACHEDIR" ] || { echo >&2 "Error: no cache directory provided"; exit 1; }

[ -n "$OUTPUTDIR" ] || { echo >&2 "Error: no output directory provided"; exit 1; }

[ -n "$SIZECONDITION" ] || { echo >&2 "Error: no size condition provided"; exit 1; }

# Entity IDs are compared in upper case from here on.
ENTITYID="$(echo "${ENTITYID}" | tr '[:lower:]' '[:upper:]')"

# Expected form: a digit, three alphanumeric characters, an underscore and an entity number.
if [[ ! $ENTITYID =~ ^[0-9][A-Z0-9]{3}_[0-9]+$ ]]
then
	echo >&2 "Error: invalid entity ID, must be '{PDB ID}_{entity number}'"
	exit 1
fi

# Only these identity levels are accepted; the error message lists every accepted value.
case "$SEQIDENTITY" in
100|95|90|70|50|30)
	;;
*)
	echo >&2 "Error: invalid sequence identity, must be 100, 95, 90, 70, 50 or 30"
	exit 1
	;;
esac

# The size condition must be an integer greater than zero
# (a non-integer value makes the test fail and is therefore rejected as well).
if [ "$SIZECONDITION" -gt "0" ]
then
	true
else
	echo >&2 "Error: invalid size condition, must be greater than 0"
	exit 1
fi

mkdir -p "$CACHEDIR"

if [ ! -d "$CACHEDIR" ]
then
	echo >&2 "Error: could not create cache directory '${CACHEDIR}'"
	exit 1
fi

# Convert the cache directory path to an absolute one.
# Resolved in a subshell, so a failing 'cd' cannot silently leave the script
# with a wrong cache path or in a wrong working directory.
CACHEDIR="$(cd "$CACHEDIR" && pwd)"

if [ -z "$CACHEDIR" ]
then
	echo >&2 "Error: could not enter cache directory"
	exit 1
fi

# The clusters file is cached per identity level and, unless '--no-date' was given, per calendar week.
CLUSTERSFILE="${CACHEDIR}/clusters-by-entity-${SEQIDENTITY}"

if [ "$NODATE" == "true" ]
then
	CLUSTERSFILE="${CLUSTERSFILE}.txt"
else
	CLUSTERSFILE="${CLUSTERSFILE}-year-$(date '+%Y-week-%V').txt"
fi

echo >&2 "... Preparing the sequence-based clustering info"

echo >&2 "...... Checking for clusters file '${CLUSTERSFILE}'"

if [ ! -s "$CLUSTERSFILE" ]
then
	CLUSTERSURL="https://cdn.rcsb.org/resources/sequence/clusters/clusters-by-entity-${SEQIDENTITY}.txt"
	
	echo >&2 "...... Downloading clusters file from '${CLUSTERSURL}'"
	
	# Download to a temporary name and rename only on success:
	# '-f' prevents an HTTP error page from being stored as cluster data,
	# and the rename prevents an interrupted or concurrent download
	# from leaving a truncated file under the cached name.
	CLUSTERSTMPFILE="${CLUSTERSFILE}.part.$$"
	
	if curl -s -f "$CLUSTERSURL" > "$CLUSTERSTMPFILE" && [ -s "$CLUSTERSTMPFILE" ]
	then
		mv "$CLUSTERSTMPFILE" "$CLUSTERSFILE"
	else
		rm -f "$CLUSTERSTMPFILE"
	fi
	
	if [ ! -s "$CLUSTERSFILE" ]
	then
		echo >&2 "Error: could not download clusters file from '${CLUSTERSURL}'"
		exit 1
	fi
fi

# Temporary working directory, removed when the script exits.
TMPLDIR="$(mktemp -d)"

if [ -z "$TMPLDIR" ] || [ ! -d "$TMPLDIR" ]
then
	echo >&2 "Error: could not create temporary directory"
	exit 1
fi

readonly TMPLDIR
trap 'rm -r "$TMPLDIR"' EXIT

echo >&2 "...... Getting a cluster of entities that contains entity '${ENTITYID}'"

# Take the first line of the clusters file that contains the entity ID as a whole
# whitespace-delimited token, split it into one ID per line, keep only well-formed IDs, sort them.
grep -E "^${ENTITYID}\$|\s${ENTITYID}\$|^${ENTITYID}\s|\s${ENTITYID}\s" "$CLUSTERSFILE" | head -1 | sed 's/\s\+/\n/g' | grep -E '\S+' \
| while read -r ANENTITYID
do
	if [[ $ANENTITYID =~ ^[0-9][A-Z0-9]{3}_[0-9]+$ ]]
	then
		echo "$ANENTITYID"
	fi
done \
| sort > "${TMPLDIR}/entities"

if [ ! -s "${TMPLDIR}/entities" ]
then
	echo >&2 "Error: no cluster found for entity '${ENTITYID}'"
	exit 1
fi

NUMOFENTITIES="$(cat "${TMPLDIR}/entities" | wc -l)"

# Comma-separated list of all entity IDs of the cluster.
LISTOFENTITIES="$(cat "${TMPLDIR}/entities" | tr '\n' ',' | sed 's/,$//')"

# Output is grouped into subdirectories named by the first two characters of the first entity ID.
SUBGROUPID="$(head -1 "${TMPLDIR}/entities" | sed 's/^\(..\).*$/\1/')"

OUTPUTDIR="${OUTPUTDIR}/${SUBGROUPID}"

# Job ID: first entity, last entity and an MD5 checksum of the whole list.
# The list is hashed through 'echo' (with its trailing newline) to keep IDs of earlier runs valid.
RAW_JOBID="$(head -1 "${TMPLDIR}/entities")__$(tail -1 "${TMPLDIR}/entities")__raw__$(echo "${LISTOFENTITIES}" | md5sum | awk '{print $1}')"

RAW_OUTPUTDIR="${OUTPUTDIR}/${RAW_JOBID}"

echo >&2 "... Preparing the raw structural data"

echo >&2 "...... Checking if the raw structural data is already prepared"

### {
# Raw data stage: downloads sequences, builds the MSA, downloads the structures,
# renumbers the entity chains by the MSA and writes the summary table.
# The stage is skipped when its 'finished' marker file already exists.
if [ -f "${RAW_OUTPUTDIR}/finished" ]
then
	echo >&2 "...... Skipping the raw structural data preparation"
else

echo >&2 "...... Downloading sequences of entities from 'https://www.rcsb.org/fasta/entity/' to 'sequences.fasta'"

# '-f' keeps the output empty on HTTP errors, so an error page is not taken for FASTA data.
curl -s -f "https://www.rcsb.org/fasta/entity/${LISTOFENTITIES}/download" > "${TMPLDIR}/sequences.fasta"

if [ ! -s "${TMPLDIR}/sequences.fasta" ]
then
	echo >&2 "Error: could not download sequences of entities ${LISTOFENTITIES}"
	exit 1
fi

if [ "$NUMOFENTITIES" -gt "1" ]
then
	echo >&2 "...... Generating MSA using Clustal Omega and saving it to 'msa.fasta'"

	clustalo -i "${TMPLDIR}/sequences.fasta" --wrap 9999999 > "${TMPLDIR}/msa.fasta"
	
	if [ ! -s "${TMPLDIR}/msa.fasta" ]
	then
		echo >&2 "Error: failed to generate MSA with clustalo"
		exit 1
	fi
else
	echo >&2 "...... No MSA needed, moving 'sequences.fasta' to 'msa.fasta'"
	
	cp "${TMPLDIR}/sequences.fasta" "${TMPLDIR}/msa.fasta"
fi

mkdir -p "$RAW_OUTPUTDIR"

if [ ! -d "$RAW_OUTPUTDIR" ]
then
	echo >&2 "Error: could not create output directory '${RAW_OUTPUTDIR}'"
	exit 1
fi

mv "${TMPLDIR}/entities" "${TMPLDIR}/sequences.fasta" "${TMPLDIR}/msa.fasta" "${RAW_OUTPUTDIR}/"

cd "$RAW_OUTPUTDIR"

RAW_OUTPUTDIR="$(pwd)"

echo >&2 "...... Writing a table of PDB IDs, entity chain IDs and sequences to 'pdbid_chain_sequence_table.tsv'"

# For every entity, take the PDB ID and a chain ID from its FASTA header and the aligned sequence from the next line.
# The header pattern requires a non-digit (or the end of line) right after the entity ID,
# so that, for example, '1ABC_1' does not also select the header of '1ABC_10'.
while read -r EID
do
	paste -d '\t' \
	  <(grep -E "^>${EID}([^0-9]|\$)" "./msa.fasta" | sed 's/^.\(....\).\+Chai\S\+\s\+\([^|,\[\s]\+\).*$/\1\t\2/' | awk '{print $1 "\t" $2}') \
	  <(grep -E -A 1 "^>${EID}([^0-9]|\$)" "./msa.fasta" | tail -1)
done < "./entities" \
> "./pdbid_chain_sequence_table.tsv"

mkdir -p "./struct_to_seq_alignments"
mkdir -p "./structures"

echo >&2 "...... Downloading full structure files from PDB"

# The loop reads the table through a redirection and not through a pipe,
# so it runs in the main shell and the 'exit 1' below really terminates the script.
while read -r PDBID CHAINID MSASEQ
do
	PDBIDSUBGROUP="$(echo "${PDBID}" | sed 's/^\(..\).*$/\1/')"
	if [ ! -s "${CACHEDIR}/structures/${PDBIDSUBGROUP}/${PDBID}.cif.gz" ]
	then
		mkdir -p "${CACHEDIR}/structures/${PDBIDSUBGROUP}"
		cd "${CACHEDIR}/structures/${PDBIDSUBGROUP}"
		LOCKDIRFORDOWNLOAD="./lock${PDBID}"
		
		# A lock directory serialises downloads of the same file by concurrently running scripts:
		# the process that creates it downloads, the others wait until it is removed.
		if mkdir "$LOCKDIRFORDOWNLOAD" 2>/dev/null
		then
			wget -O "${PDBID}.cif.gz" -q "https://files.rcsb.org/download/${PDBID}.cif.gz"
			if [ ! -s "${PDBID}.cif.gz" ]
			then
				rm -f "${PDBID}.cif.gz"
			fi
			rm -rf "$LOCKDIRFORDOWNLOAD"
		else
			while [ -d "$LOCKDIRFORDOWNLOAD" ]
			do
				sleep 1
			done
		fi
		
		if [ ! -s "${PDBID}.cif.gz" ]
		then
			echo >&2 "Error: could not download mmCIF file for PDB ID '${PDBID}'"
			exit 1
		fi
		
		cd - &> /dev/null
	fi
done < "./pdbid_chain_sequence_table.tsv"

echo >&2 "...... Importing structure files, renumbering the entity chains by sequences, saving structures"

# For every table row a voronota-js script is generated and piped to 'voronota-js':
# the first here-document injects the per-row parameters, the second one is the fixed script body.
cat "./pdbid_chain_sequence_table.tsv" \
| while read -r PDBID CHAINID MSASEQ
do
PDBIDSUBGROUP="$(echo "${PDBID}" | sed 's/^\(..\).*$/\1/')"
PDBCACHEDFILE="${CACHEDIR}/structures/${PDBIDSUBGROUP}/${PDBID}.cif.gz"
{
cat << EOF
var params={}
params.pdb_file='$PDBCACHEDFILE';
params.chain_id='$CHAINID';
params.sequence_str='$MSASEQ';
params.strict='true';
EOF

cat << 'EOF'
voronota_setup_defaults("-no-load-voromqa-potentials", "-no-load-more-atom-types", "-no-load-mock-voromqa-potential -include-heteroatoms");
voronota_assert_full_success("Failed to setup defaults");

voronota_import_mmcif("-files", [params.pdb_file], "-use-label-ids");
voronota_assert_full_success("Failed to fetch structure");

voronota_restrict_atoms("-use", "[-protein]");
voronota_assert_full_success("Failed to restrict atoms to protein polymer residues");

var selection="[-chain "+params.chain_id+"]";

voronota_restrict_atoms("-use", selection);
voronota_assert_full_success("Failed to restrict atoms by chain ID");

var alignment_file="./struct_to_seq_alignments/${objectname}_"+params.chain_id;

voronota_set_chain_residue_numbers_by_sequence("-use "+selection+" -sequence-string '"+params.sequence_str+"' -keep-dashes -alignment-file '"+alignment_file+"' -only-equal-pairs "+params.strict);
voronota_assert_full_success("Failed to align sequences and renumber");

voronota_export_atoms_to_mmcif("-file", "./structures/${objectname}_"+params.chain_id+".cif");
voronota_assert_full_success("Failed to output structure");
EOF

} \
| voronota-js --no-setup-defaults

done

echo >&2 "...... Writing table with structure-to-sequence alignments to 'structure_sequence_table.tsv'"

# Columns: alignment file name, last line of the alignment file, number of non-dash characters in that line.
find ./struct_to_seq_alignments/ -type f | sort \
| while read -r AFILE
do
	echo "$(basename "${AFILE}")" "$(tail -1 "${AFILE}")" "$(tail -1 "${AFILE}" | tr -d '\n' | tr -d '-' | wc -c)"
done \
| awk '{print $1 "\t" $2 "\t" $3}' \
> "./structure_sequence_table.tsv"

MAXSTRUCTURELENGTH="$(cat ./structure_sequence_table.tsv | awk 'NR == 1 || $3 > max { max = $3 } END { print max }')"

# Append a fourth column with the difference to the maximum length and sort by it.
cat ./structure_sequence_table.tsv \
| awk -v maxlength="${MAXSTRUCTURELENGTH}" '{print $1 "\t" $2 "\t" $3 "\t" (maxlength-$3)}' \
| sort -n -k4,4 \
> "./structure_sequence_table_more.tsv"

mv "./structure_sequence_table_more.tsv" "./structure_sequence_table.tsv"

rm -r "./struct_to_seq_alignments"

echo >&2 "...... Creating the cluster raw processing finish file 'finished'"

echo "yes" > "./finished"

fi
### }

# Enter the raw data directory and store its absolute path.
cd "$RAW_OUTPUTDIR"
RAW_OUTPUTDIR="$(pwd)"

# The reduced job ID starts with the raw job ID whose '__raw__<checksum>' tail is replaced by '__reduced_' ...
REDUCED_JOBID="$(echo "${RAW_JOBID}" | sed 's|__raw__.*$|__reduced_|')"

# ... gets a marker when no superposition was requested ...
[ "$NOSUPERPOSING" != "true" ] || REDUCED_JOBID="${REDUCED_JOBID}_nosuperpos"

# ... and ends with an MD5 checksum of the sorted names of the structures that satisfy the size condition.
REDUCED_JOBID="${REDUCED_JOBID}_$(awk -v maxloss=${SIZECONDITION} '{if($4<=maxloss){print $1}}' < ./structure_sequence_table.tsv | sort | md5sum | awk '{print $1}')"

# The reduced data directory is a sibling of the raw data directory.
cd ..
REDUCED_OUTPUTDIR="$(pwd)/${REDUCED_JOBID}"

echo >&2 "... Preparing the reduced structural data"

echo >&2 "...... Checking if the reduced structural data is already prepared"

### {
# Reduced data stage: selects structures by the size condition, removes residue positions
# that are missing in any of the selected structures, and optionally superposes the results.
# The stage is skipped when its 'finished' marker exists or when '--no-reducing' was given.
if [ -f "${REDUCED_OUTPUTDIR}/finished" ] || [ "$NOREDUCING" == "true" ]
then
	echo >&2 "...... Skipping the reduced structural data preparation"
else

mkdir -p "$REDUCED_OUTPUTDIR"

if [ ! -d "$REDUCED_OUTPUTDIR" ]
then
	echo >&2 "Error: could not create output directory '${REDUCED_OUTPUTDIR}'"
	exit 1
fi

cd "$REDUCED_OUTPUTDIR"

echo >&2 "...... Preparing the length loss-conditioned table of structure sequence info table 'conditioned_structure_sequence_table.tsv'"

# Keep the rows whose length loss (fourth column) does not exceed the size condition.
cat "${RAW_OUTPUTDIR}/structure_sequence_table.tsv" \
| awk -v maxloss="${SIZECONDITION}" '{if($4<=maxloss){print $0}}' \
> "./conditioned_structure_sequence_table.tsv"

echo >&2 "...... Saving unstable residue occurrencies to 'unstable_residue_numbers'"

# Collect every alignment column that holds a dash in at least one of the selected sequences.
cat "./conditioned_structure_sequence_table.tsv" \
| awk '{print $2}' | awk 'BEGIN {FS = ""} {for(i=1;i<=NF;i++){if($i=="-"){cols_with_dash[i]=1}}} END {for(col in cols_with_dash){printf col ","}}' \
| tr ',' '\n' \
> "./unstable_residue_numbers"

RESIDUESTOREMOVE="$(cat ./unstable_residue_numbers | tr '\n' ',' | sed 's/,$//')"

mkdir -p "./structures_reduced"
mkdir -p "./sequences_reduced"

echo >&2 "...... Reducing structures to exclude unstable residue numbers, exporting resulting sequences with gaps"

# One generated voronota-js script per selected structure:
# the first here-document injects the parameters, the second one is the fixed script body.
cat "./conditioned_structure_sequence_table.tsv" \
| awk '{print $1}' \
| while read -r STRUCTNAME
do
{
cat << EOF
var params={}
params.raw_data_dir='$RAW_OUTPUTDIR';
params.struct_name='$STRUCTNAME';
params.residues_to_remove='$RESIDUESTOREMOVE';
EOF

cat << 'EOF'
voronota_setup_defaults("-no-load-voromqa-potentials", "-no-load-more-atom-types", "-no-load-mock-voromqa-potential -include-heteroatoms");
voronota_assert_full_success("Failed to setup defaults");

voronota_import_mmcif("-files", params.raw_data_dir+"/structures/"+params.struct_name+".cif");
voronota_assert_full_success("Failed to import structure");

if(params.residues_to_remove)
{
	voronota_restrict_atoms("-use", "(not [-rnum "+params.residues_to_remove+"])");
	voronota_assert_full_success("Failed to restrict atoms to stable residues");
}

voronota_export_atoms_to_mmcif("-file", "./structures_reduced/"+params.struct_name+".cif");
voronota_assert_full_success("Failed to output structure");

voronota_export_sequence("-file", "./sequences_reduced/"+params.struct_name, "-gap-filler", "dash");
voronota_assert_full_success("Failed to output sequence");
EOF

} \
| voronota-js --no-setup-defaults

done

echo >&2 "...... Preparing the reduced structure sequence info table 'structure_reduced_sequence_table.tsv'"

# Columns: structure name, last line of the exported sequence file, number of non-dash characters in that line.
find ./sequences_reduced/ -type f | sort \
| while read -r SFILE
do
	echo "$(basename "${SFILE}")" "$(tail -1 "${SFILE}")" "$(tail -1 "${SFILE}" | tr -d '\n' | tr -d '-' | wc -c)"
done \
| awk '{print $1 "\t" $2 "\t" $3}' \
> "./structure_reduced_sequence_table.tsv"

rm -r "./sequences_reduced"


echo >&2 "...... Preparing the reduced structure MSA file 'structure_reduced_msa.fasta'"

cat "./structure_reduced_sequence_table.tsv" \
| awk '{print ">" $1 "\n" $2}' \
> "./structure_reduced_msa.fasta"

if [ "$(cat ./structure_reduced_sequence_table.tsv | wc -l)" -lt "2" ]
then
	echo >&2 "... Abandoning further operations because there is only one structure"
	echo "yes" > "./finished"
elif [ "$NOSUPERPOSING" == "true" ]
then
	echo >&2 "... Abandoning further operations because no superposing was requested"
	echo "yes" > "./finished"
else

echo >&2 "...... Superposing reduced structured using qcprot, saving RMSD info and superposed structures"

mkdir -p "./structures_reduced_superposed"

# A single generated voronota-js script: the list of input files is produced by the shell loop,
# the fixed script body then imports, superposes and exports all the structures.
{
cat << EOF
var input_files_array=[];
EOF

find ./structures_reduced/ -type f | sort \
| while read -r STRUCTFILE
do
STRUCTFILEBASENAME="$(basename "${STRUCTFILE}" .cif)"
cat << EOF
input_files_array.push({"path":"$STRUCTFILE", "title":"$STRUCTFILEBASENAME"});
EOF
done

cat << 'EOF'
voronota_setup_defaults("-no-load-voromqa-potentials", "-no-load-more-atom-types", "-no-load-mock-voromqa-potential -include-heteroatoms");
voronota_assert_full_success("Failed to setup defaults");

for(var i=0;i<input_files_array.length;i++)
{
	voronota_import_mmcif("-files", [input_files_array[i].path], "-title", input_files_array[i].title);
	voronota_assert_full_success("Failed to import structure '"+input_files_array[i].path+"'");
}

voronota_pick_objects();

voronota_set_chain_name("-chain-name", "A");
voronota_assert_full_success("Failed to set chain name to 'A'");

voronota_qcprot_many("-target-sel", "[]", "-model-sel", "[]", "-output-rmsd-file", "./rmsd_for_all_pairs", "-all-to-all", "-verify-atom-ids");
voronota_assert_full_success("Failed to compute optimal RMSD using qcprot for all pairs");

var representative_name=voronota_last_output().results[0].output.qcprot_results_summary.min_mean_rmsd_target;

voronota_qcprot_many("-target", representative_name, "-target-sel", "[]", "-model-sel", "[]", "-output-rmsd-file", "./rmsd_for_pairs_with_representive", "-verify-atom-ids");
voronota_assert_full_success("Failed to compute optimal RMSD using qcprot for pairs with representive");

voronota_export_atoms_to_mmcif("-file", "./structures_reduced_superposed/${objectname}.cif");
voronota_assert_full_success("Failed to output structures");

voronota_export_atoms_to_mmcif_multimodel("-file", "./structures_reduced_superposed_multimodel_numbered.cif", "-model-numbers");
voronota_assert_full_success("Failed to output structures to multimodel numbered mmCIF");

voronota_export_atoms_to_mmcif_multimodel("-file", "./structures_reduced_superposed_multimodel_named.cif");
voronota_assert_full_success("Failed to output structures to multimodel named mmCIF");
EOF
} \
| voronota-js --no-setup-defaults

echo "yes" > "./finished"

fi

fi
### }

# Results summary, printed to standard output.
# Every path expansion is quoted, so that output locations containing spaces are reported correctly.
echo >&2 "... Summarizing output"

echo
echo "List of entities in the cluster: $(cat "${RAW_OUTPUTDIR}/entities" | tr '\n' ' ')"

echo
echo "Stats:"
{
echo "$(cat "${RAW_OUTPUTDIR}/entities" | wc -l) - number_of_entities"
echo "$(cat "${RAW_OUTPUTDIR}/structure_sequence_table.tsv" | wc -l) - number_of_raw_structures"
if [ -f "${REDUCED_OUTPUTDIR}/structure_reduced_sequence_table.tsv" ]
then
echo "$(cat "${REDUCED_OUTPUTDIR}/structure_reduced_sequence_table.tsv" | wc -l) - number_of_reduced_structures"
fi
if [ -f "${REDUCED_OUTPUTDIR}/rmsd_for_all_pairs" ]
then
# The value is taken from the third field of the first line of the RMSD file.
echo "$(cat "${REDUCED_OUTPUTDIR}/rmsd_for_all_pairs" | head -1 | awk '{print $3}') - max_RMSD"
fi
} | column -t | tr '_' ' '

echo
echo "Base directories:"
{
echo "$CACHEDIR/ - cache"
echo "$(dirname "${RAW_OUTPUTDIR}")/  - output"
} | column -t

echo
echo "Raw structural data files:"
echo "$(basename "${RAW_OUTPUTDIR}")/                              -  root raw data directory"
echo "$(basename "${RAW_OUTPUTDIR}")/sequences.fasta               -  downloaded sequences"
echo "$(basename "${RAW_OUTPUTDIR}")/msa.fasta                     -  MSA from sequences"
echo "$(basename "${RAW_OUTPUTDIR}")/structure_sequence_table.tsv  -  summary table"
echo "$(basename "${RAW_OUTPUTDIR}")/structures/                   -  directory with renumbered structures"

if [ -d "${REDUCED_OUTPUTDIR}" ]
then
echo
echo "Reduced structural data files:"
echo "$(basename "${REDUCED_OUTPUTDIR}")/                                                       -  root reduced data directory"
echo "$(basename "${REDUCED_OUTPUTDIR}")/unstable_residue_numbers                               -  removed unstable residue numbers"
echo "$(basename "${REDUCED_OUTPUTDIR}")/structure_reduced_sequence_table.tsv                   -  summary table"
echo "$(basename "${REDUCED_OUTPUTDIR}")/structure_reduced_msa.fasta                            -  MSA from sequences of reduced structures"
fi

if [ -s "${REDUCED_OUTPUTDIR}/rmsd_for_all_pairs" ]
then
echo "$(basename "${REDUCED_OUTPUTDIR}")/rmsd_for_all_pairs                                     -  RMSD for all pairs of reduced structures"
echo "$(basename "${REDUCED_OUTPUTDIR}")/structures_reduced_superposed/                         -  directory with reduced and superposed structures"
echo "$(basename "${REDUCED_OUTPUTDIR}")/structures_reduced_superposed_multimodel_named.cif     -  mmCIF file with string model IDs"
echo "$(basename "${REDUCED_OUTPUTDIR}")/structures_reduced_superposed_multimodel_numbered.cif  -  mmCIF file with integer model IDs"
fi

echo
echo "Commands to view raw structures:"
echo "pymol ${RAW_OUTPUTDIR}/structures/*"
echo "voronota-gl ${RAW_OUTPUTDIR}/structures/*"

if [ -d "${REDUCED_OUTPUTDIR}/structures_reduced" ]
then
echo
echo "Commands to view reduced structures:"
echo "pymol ${REDUCED_OUTPUTDIR}/structures_reduced/*"
echo "voronota-gl ${REDUCED_OUTPUTDIR}/structures_reduced/*"
fi

if [ -s "${REDUCED_OUTPUTDIR}/structures_reduced_superposed_multimodel_numbered.cif" ]
then
echo
echo "Commands to view reduced superposed structures:"
echo "pymol '${REDUCED_OUTPUTDIR}/structures_reduced_superposed_multimodel_numbered.cif'"
echo "voronota-gl '${REDUCED_OUTPUTDIR}/structures_reduced_superposed_multimodel_numbered.cif'"
fi

echo

# MT stage: runs only when '--run-mt' or '--run-analysis-after-mt' was given.
if [ "$RUNMT" != "true" ]
then
	exit 0
fi

if [ "$(cat "${REDUCED_OUTPUTDIR}/structure_reduced_sequence_table.tsv" | wc -l)" -lt "2" ]
then
	echo >&2 "... Not applying MT further operations because there is only one structure"
	exit 0
fi

echo >&2 "... Applying MT"

echo >&2 "...... Checking if MT was already applied"

# The stage is skipped when its 'finished' marker file already exists.
if [ -f "${REDUCED_OUTPUTDIR}/mt/finished" ]
then
	echo >&2 "...... Skipping MT application"
else

echo >&2 "...... Running MT"

command -v MT &> /dev/null || { echo >&2 "Error: 'MT' executable not in binaries path"; exit 1; }

# Everything below uses paths relative to the reduced data directory, so entering it must succeed.
cd "${REDUCED_OUTPUTDIR}" || { echo >&2 "Error: could not enter directory '${REDUCED_OUTPUTDIR}'"; exit 1; }

mkdir -p "./mt"
cd "./mt"

NDOMAINS=10

# NOTE(review): '--nDomains' is passed without a value here, while NDOMAINS is set just above
# and the morphing call further below passes '--nDomains "$NDOMAINS"' - confirm whether the value is missing.
MT ../structures_reduced_superposed_multimodel_named.cif ../structure_reduced_msa.fasta -s 0 --tree 3 --nDomains \
> "./split.log" \
2> "./split.err"

# Turn the 'Rigid Block' lines of the MT log into '<letter> <residue numbers selection> <loss>' rows,
# ordered by decreasing loss value; only rows with a positive loss value are kept.
grep -E '^Rigid Block ' "./split.log" \
| sed 's/^Rigid\sBlock\s\(\S\+\)\s*\:\s*\(\S.\+\)Loss\s\(\S\+\)\s.*$/\2 \3/' \
| sed 's/\.\.\+//' \
| sed 's/^A//' \
| sed 's/ A/,/g' \
| awk '{print $1 " " $2}' \
| sort -n -k2,2 -r \
| awk '{if($2>0.0){print substr("ABCDEFGHIJKLMNOPQRSTUVWXYZ",NR,1) " " $1 " " $2}}' \
> "./mt_domains.txt"

# NOTE(review): an error is reported here but the exit status is 0 - confirm whether that is intended.
if [ ! -s "./mt_domains.txt" ]
then
	echo >&2 "Error: MT did not produce domain split, see $(pwd)/split.err"
	exit 0
fi

# The number of domains for the morphing step is the number of found domains, but at least 2.
NDOMAINS="$(cat ./mt_domains.txt | wc -l)"
if [ "$NDOMAINS" -lt "2" ]
then
	NDOMAINS="2"
fi

echo >&2 "...... Using MT output to generate marked structures"

cd "${REDUCED_OUTPUTDIR}"

mkdir -p "./mt/marked_structures"

# Generated voronota-js script: the domain list comes from 'mt_domains.txt',
# the fixed script body renames the chain of every domain to the letter of that domain.
{
cat << EOF
var mt_results=[];
EOF

cat "./mt/mt_domains.txt" \
| while read -r DNAME RNUMSEL LOSSVAL
do
cat << EOF
mt_results.push({"dname":"$DNAME", "rnumsel":"$RNUMSEL"});
EOF
done

cat << 'EOF'
voronota_setup_defaults("-no-load-voromqa-potentials", "-no-load-more-atom-types", "-no-load-mock-voromqa-potential -include-heteroatoms");
voronota_assert_full_success("Failed to setup defaults");

voronota_import_mmcif("-files", ["structures_reduced_superposed_multimodel_named.cif"], "-title", "${model}");

voronota_pick_objects();

voronota_set_chain_name("-chain-name", "A");
voronota_assert_full_success("Failed to set chain name");

for(var i=0;i<mt_results.length;i++)
{
	voronota_set_chain_name("-use", "[-rnum "+mt_results[i].rnumsel+"]", "-chain-name", mt_results[i].dname);
	voronota_assert_full_success("Failed to set chain name");
}

voronota_export_atoms("-as-pdb", "-file", "./mt/marked_structures/${objectname}.pdb");
voronota_assert_full_success("Failed to output structures");
EOF
} \
| voronota-js --no-setup-defaults

echo >&2 "...... Using MT to generate morphs"

mkdir -p "./mt/morphs"

cd "./mt/morphs"

MT ../../structures_reduced_superposed_multimodel_named.cif ../../structure_reduced_msa.fasta -s 10 --tree 3 --nDomains "$NDOMAINS" -o ./morph \
> ../morph.log \
2> ../morph.err

# Remove 'tmp*' files from the morphs directory; '-f' keeps this silent when there are none.
rm -f ./tmp*

# Convert every 'morph*.cif' file to PDB files using a generated voronota-js script.
{
cat << EOF
var input_files_array=[];
EOF

find ./ -type f -name 'morph*.cif' | sort \
| while read -r STRUCTFILE
do
STRUCTFILEBASENAME="$(basename "${STRUCTFILE}" .cif)"
cat << EOF
input_files_array.push({"path":"$STRUCTFILE", "title":"$STRUCTFILEBASENAME"});
EOF
done

cat << 'EOF'
voronota_setup_defaults("-no-load-voromqa-potentials", "-no-load-more-atom-types", "-no-load-mock-voromqa-potential -include-heteroatoms");
voronota_assert_full_success("Failed to setup defaults");

for(var i=0;i<input_files_array.length;i++)
{
	voronota_import_mmcif("-files", [input_files_array[i].path], "-title", input_files_array[i].title+"_${model}");
	voronota_assert_full_success("Failed to import structure '"+input_files_array[i].path+"'");
}

voronota_pick_objects();

voronota_export_atoms("-as-pdb", "-file", "./${objectname}.pdb");
voronota_assert_full_success("Failed to output structures");
EOF
} \
| voronota-js --no-setup-defaults

rm ./morph*.cif

cd ..

echo "yes" > "./finished"

fi

# Summary of the MT stage, printed to standard output.
cd "${REDUCED_OUTPUTDIR}"

echo >&2 "... Summarizing MT running results"

echo
echo "MT split summary:"
cat "./mt/mt_domains.txt" | column -t

echo
echo "MT warnings summary:"
{
# Count the non-empty lines of the captured standard error streams of the two MT runs.
echo "$(cat ./mt/split.err | grep -E . | wc -l) - lines_in_split.err"
echo "$(cat ./mt/morph.err | grep -E . | wc -l) - lines_in_morph.err"
} | column -t | tr '_' ' '

echo
echo "MT output files:"
echo "$(basename "${REDUCED_OUTPUTDIR}")/mt/                           -  root data directory"
echo "$(basename "${REDUCED_OUTPUTDIR}")/mt/mt_domains.txt             -  split info"
echo "$(basename "${REDUCED_OUTPUTDIR}")/mt/marked_structures/         -  directory with structures with domains marked by different chain IDs"
echo "$(basename "${REDUCED_OUTPUTDIR}")/mt/morphs/                    -  directory with structured of morph states"

echo
echo "Commands to view marked structures:"
echo "pymol ${REDUCED_OUTPUTDIR}/mt/marked_structures/*"
echo "voronota-gl ${REDUCED_OUTPUTDIR}/mt/marked_structures/*"

echo

# The analysis stage that follows runs only when '--run-analysis-after-mt' was given.
if [ "$RUNANALYSISAFTERMT" != "true" ]
then
	exit 0
fi

# Analysis stage: needs at least two reduced structures.
if [ "$(cat "${REDUCED_OUTPUTDIR}/structure_reduced_sequence_table.tsv" | wc -l)" -lt "2" ]
then
	echo >&2 "... Not applying MT further operations because there is only one structure"
	exit 0
fi

echo >&2 "... Running analysis after MT"

echo >&2 "...... Checking if analysis after MT was already done"

# The stage is skipped when its 'finished' marker file already exists.
if [ -f "${REDUCED_OUTPUTDIR}/mt/analysis/finished" ]
then
	echo >&2 "...... Skipping analysis after MT"
else

echo >&2 "...... Recording inter-domain interface computation times"

mkdir -p "${REDUCED_OUTPUTDIR}/mt/analysis"

cd "${REDUCED_OUTPUTDIR}/mt/analysis"

# 'time -p' reports to the standard error of the shell, so each timed command is wrapped
# into a brace group whose whole output is redirected to the corresponding record file.
{
time -p (find ../marked_structures/ -type f | xargs -L 1 voronota-contacts --contacts-query '--no-same-chain --no-solvent' --input > /dev/null)
} &> "./recorded_time_for_slow_mode"

{
time -p (find ../marked_structures/ -type f | xargs -L 1 voronota-js-fast-iface-contacts --input > /dev/null)
} &> "./recorded_time_for_fast_mode"

echo >&2 "...... Computing intra-domain CAD-score scores"

# All ordered pairs of different marked structures are compared.
# The awk filter keeps the very first output line and every even-numbered line
# (the commands are run with '--output-header').
{
find ../marked_structures/ -type f | sort | while read -r TFILE
do
	find ../marked_structures/ -type f | sort | while read -r MFILE
	do
		if [ "$TFILE" != "$MFILE" ]
		then
			voronota-cadscore --cache-dir "./cachedir" -t "$TFILE" -m "$MFILE" --output-header --ignore-residue-names --contacts-query '--match-max-seq-sep 99999'
		fi
	done
done
} \
| awk '{if(NR==1 || NR%2==0){print $0}}' \
> "./cadscore_intra_domain"

echo >&2 "...... Computing inter-domain CAD-score scores"

{
find ../marked_structures/ -type f | sort | while read -r TFILE
do
	find ../marked_structures/ -type f | sort | while read -r MFILE
	do
		if [ "$TFILE" != "$MFILE" ]
		then
			voronota-cadscore --cache-dir "./cachedir" -t "$TFILE" -m "$MFILE" --output-header --ignore-residue-names --contacts-query-inter-chain
		fi
	done
done
} \
| awk '{if(NR==1 || NR%2==0){print $0}}' \
> "./cadscore_inter_domain"

# '-f' keeps the cleanup silent when the cache directory was never created.
rm -rf "./cachedir"

echo >&2 "...... Summarizing CAD-score scores"

# Merge the two score tables by target and model file and plot one score against the other;
# all output of R is discarded.
R --vanilla << 'EOF' &> /dev/null
dt1=read.table("cadscore_intra_domain", header=TRUE, stringsAsFactors=FALSE);
dt1$intra_domain_cadscore=dt1$score;
dt2=read.table("cadscore_inter_domain", header=TRUE, stringsAsFactors=FALSE);
dt2$inter_domain_cadscore=dt2$score;
dt=merge(dt1, dt2, by=c("target_file", "model_file"));
dt=dt[,c("target_file", "model_file", "intra_domain_cadscore", "inter_domain_cadscore")];
png("intra_domain_vs_inter_domain_cadscore.png");
plot(x=dt$intra_domain_cadscore, y=dt$inter_domain_cadscore, xlim=c(0,1), ylim=c(0,1), xlab="intra-domain CAD-score", ylab="inter-domain CAD-score", main="intra-domain vs inter-domain CAD-score");
dev.off();
EOF

echo "yes" > "./finished"

fi

# Final report of the analysis stage: the recorded timings and the location of the plot.
cd "${REDUCED_OUTPUTDIR}"

echo >&2 "... Summarizing analysis after MT"

printf '\n%s\n' "Slow mode time for inter-domain interface computation:"
cat "${REDUCED_OUTPUTDIR}/mt/analysis/recorded_time_for_slow_mode" | column -t

printf '\n%s\n' "Fast mode time for inter-domain interface computation:"
cat "${REDUCED_OUTPUTDIR}/mt/analysis/recorded_time_for_fast_mode" | column -t

printf '\n%s\n' "Command to view intra-domain vs inter-domain CAD-score plot:"
printf '%s\n' "display '${REDUCED_OUTPUTDIR}/mt/analysis/intra_domain_vs_inter_domain_cadscore.png'"

printf '\n'

exit 0

