Skip to content

Commit 8ef870f

Browse files
Add a workflow to pick more central cluster representatives
1 parent 1668032 commit 8ef870f

File tree

6 files changed

+139
-1
lines changed

6 files changed

+139
-1
lines changed

data/workflow/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ set(GENERATED_WORKFLOWS
2424
workflow/taxonomy.sh
2525
workflow/linsearch.sh
2626
workflow/databases.sh
27+
workflow/pickconsensusrep.sh
2728
workflow/nucleotide_clustering.sh
2829
workflow/iterativepp.sh
2930
workflow/tsv2exprofiledb.sh

data/workflow/pickconsensusrep.sh

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#!/bin/sh -e
2+
3+
fail() {
4+
echo "Error: $1"
5+
exit 1
6+
}
7+
8+
notExists() {
9+
[ ! -f "$1" ]
10+
}
11+
12+
[ -z "$MMSEQS" ] && echo "Please set the environment variable \$MMSEQS to your MMSEQS binary." && exit 1;
13+
[ "$#" -ne 4 ] && echo "Please provide <seqDB> <clusterDB> <outClusterDB> <tmp>" && exit 1;
14+
[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1;
15+
[ ! -f "$2.dbtype" ] && echo "$2.dbtype not found!" && exit 1;
16+
[ ! -d "$4" ] && echo "tmp directory $4 not found!" && mkdir -p "$4";
17+
TMP_PATH="$4"
18+
19+
if notExists "${TMP_PATH}/msa.dbtype"; then
20+
# shellcheck disable=SC2086
21+
"$MMSEQS" result2msa "$1" "$1" "$2" "$TMP_PATH/msa" ${RESULT2MSA_PAR} \
22+
|| fail "result2msa failed"
23+
fi
24+
25+
if notExists "${TMP_PATH}/profile.dbtype"; then
26+
# shellcheck disable=SC2086
27+
"$MMSEQS" msa2profile "$TMP_PATH/msa" "$TMP_PATH/profile" ${MSA2PROFILE_PAR} \
28+
|| fail "result2msa failed"
29+
fi
30+
31+
if notExists "${TMP_PATH}/aln.dbtype"; then
32+
# shellcheck disable=SC2086
33+
"$MMSEQS" align "$TMP_PATH/profile" "$1" "$2" "${TMP_PATH}/aln" || fail "align failed"
34+
fi
35+
36+
if notExists "${TMP_PATH}/aln.tsv"; then
37+
# shellcheck disable=SC2086
38+
"$MMSEQS" prefixid "${TMP_PATH}/aln" "${TMP_PATH}/aln.tsv" --tsv ${VERBOSITY} || fail "prefixid1 -tsv failed"
39+
fi
40+
41+
awk 'FNR == NR{ best[$1]=1; rep[$1] = $1; next}
42+
{
43+
cluster = $1; member = $2; score = $3;
44+
if (!(cluster in best) || score > best[cluster]) {
45+
best[cluster] = score;
46+
rep[cluster] = member;
47+
}
48+
}
49+
END {
50+
for (cluster in rep) {
51+
print cluster "\t" rep[cluster];
52+
}
53+
}' "${TMP_PATH}/aln.index" "${TMP_PATH}/aln.tsv" > "${TMP_PATH}/rep_mapping.txt"
54+
55+
if notExists "${TMP_PATH}/clu.tsv"; then
56+
# shellcheck disable=SC2086
57+
"$MMSEQS" prefixid "$2" "${TMP_PATH}/clu.tsv" --tsv ${VERBOSITY} || fail "prefixid2 -tsv failed"
58+
fi
59+
60+
if notExists "${TMP_PATH}/updated_clu.tsv"; then
61+
awk 'FNR == NR{f[$1] = $2; next}
62+
$1 != prev { print f[$1] "\t" f[$1]; prev = $1; }
63+
$1 in f && $2 != f[$1]{print f[$1]"\t"$2}' "${TMP_PATH}/rep_mapping.txt" "${TMP_PATH}/clu.tsv" > "${TMP_PATH}/updated_clu.tsv"
64+
fi
65+
66+
"$MMSEQS" tsv2db "${TMP_PATH}/updated_clu.tsv" "${3}" --output-dbtype 6 || fail "tsv2db failed"
67+
68+
if [ -n "$REMOVE_TMP" ]; then
69+
rm -f "${TMP_PATH}/updated_clu.tsv"
70+
rm -f "${TMP_PATH}/aln.tsv"
71+
rm -f "${TMP_PATH}/rep_mapping.txt"
72+
rm -f "${TMP_PATH}/clu.tsv"
73+
rm -f "${TMP_PATH}/updated_clu.tsv"
74+
# shellcheck disable=SC2086
75+
"$MMSEQS" rmdb "${TMP_PATH}/msa" ${VERBOSITY}
76+
# shellcheck disable=SC2086
77+
"$MMSEQS" rmdb "${TMP_PATH}/profile" ${VERBOSITY}
78+
# shellcheck disable=SC2086
79+
"$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY}
80+
81+
rm -rf "${TMP_PATH}/pickconsensusrep.sh"
82+
fi

src/CommandDeclarations.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ extern int nrtotaxmapping(int argc, const char **argv, const Command& command);
8686
extern int offsetalignment(int argc, const char **argv, const Command& command);
8787
extern int orftocontig(int argc, const char **argv, const Command& command);
8888
extern int touchdb(int argc, const char **argv, const Command& command);
89+
extern int pickconsensusrep(int argc, const char **argv, const Command& command);
8990
extern int prefilter(int argc, const char **argv, const Command& command);
9091
extern int prefixid(int argc, const char **argv, const Command& command);
9192
extern int profile2cs(int argc, const char **argv, const Command& command);

src/MMseqsBase.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,15 @@ std::vector<Command> baseCommands = {
572572
{"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
573573

574574

575-
575+
{"pickconsensusrep", pickconsensusrep, &par.verbandcompression, COMMAND_CLUSTER,
576+
"Select new representatives for each cluster based on consensus",
577+
NULL,
578+
"Martin Steinegger <martin.steinegger@snu.ac.kr> & Maria Hauser",
579+
"<i:seqDb> <i:clusterDB> <o:clusterDB> <tmpDir>",
580+
CITATION_MMSEQS2, {{"seqDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
581+
{"clusterDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
582+
{"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
583+
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
576584
{"prefilter", prefilter, &par.prefilter, COMMAND_PREFILTER,
577585
"Double consecutive diagonal k-mer search",
578586
NULL,

src/workflow/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ set(workflow_source_files
1313
workflow/Rbh.cpp
1414
workflow/Search.cpp
1515
workflow/Taxonomy.cpp
16+
workflow/PickConsensusRep.cpp
1617
workflow/EasyTaxonomy.cpp
1718
workflow/CreateIndex.cpp
1819
PARENT_SCOPE

src/workflow/PickConsensusRep.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#include "Parameters.h"
2+
#include "FileUtil.h"
3+
#include "CommandCaller.h"
4+
#include <cassert>
5+
#include <string>
6+
// Include the embedded shell script.
7+
#include "pickconsensusrep.sh.h"
8+
9+
// Minimal workflow function that runs the pickcenterrep workflow.
10+
int pickconsensusrep(int argc, const char **argv, const Command &command) {
11+
Parameters &par = Parameters::getInstance();
12+
par.parseParameters(argc, argv, command, true, 0, 0);
13+
14+
CommandCaller cmd;
15+
par.allowDeletion = 1;
16+
par.PARAM_ALLOW_DELETION.wasSet = true;
17+
cmd.addVariable("RESULT2MSA_PAR", par.createParameterString(par.result2msa, true).c_str());
18+
par.matchMode = 1;
19+
par.PARAM_MATCH_MODE.wasSet = true;
20+
cmd.addVariable("MSA2PROFILE_PAR", par.createParameterString(par.msa2profile, true).c_str());
21+
cmd.addVariable("RENAMEDBKEYS_PAR", par.createParameterString(par.renamedbkeys).c_str());
22+
cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
23+
24+
// The temporary directory is provided as the 4th argument.
25+
std::string tmpDir = par.db4;
26+
std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, par.mapworkflow));
27+
if (par.reuseLatest) {
28+
hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
29+
}
30+
tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);
31+
par.filenames.pop_back();
32+
par.filenames.push_back(tmpDir);
33+
34+
// Write out the embedded shell script to a file in the temporary directory.
35+
std::string program = tmpDir + "/pickconsensusrep.sh";
36+
FileUtil::writeFile(program, pickconsensusrep_sh, pickconsensusrep_sh_len);
37+
38+
// Execute the shell script.
39+
cmd.execProgram(program.c_str(), par.filenames);
40+
41+
// The shell script should not return; if it does, abort.
42+
assert(false);
43+
return 0;
44+
}
45+

0 commit comments

Comments
 (0)