Skip to content

Commit 2c52704

Browse files
authored
Merge pull request #28 from bigict/dev
fix: init db
2 parents 48fe7ca + 4d0517c commit 2c52704

File tree

3 files changed

+150
-16
lines changed

3 files changed

+150
-16
lines changed

data/tcr_pmhc_db/test.idx

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
1bd2_P
2+
1fo0_P
3+
1g6r_P
4+
1kj2_P
5+
1lp9_P
6+
1mwa_P
7+
1nam_P
8+
1qse_P
9+
1rc3_P
10+
2bnq_P
11+
2bnr_P
12+
2ckb_P
13+
2e7l_P
14+
2esv_P
15+
2f53_P
16+
2f54_P
17+
2gj6_P
18+
2nx5_P
19+
2oi9_P
20+
2ol3_P
21+
2p5e_P
22+
2p5w_P
23+
2pye_P
24+
2vlk_P
25+
2vlr_P
26+
2ypl_P
27+
3dxa_P
28+
3e3q_P
29+
3ffc_P
30+
3gsn_P
31+
3h9s_P
32+
3hg1_P
33+
3kpr_P
34+
3mv7_P
35+
3mv8_P
36+
3mv9_P
37+
3o4l_P
38+
3pqy_P
39+
3qdg_P
40+
3qdj_P
41+
3qdm_P
42+
3qeq_P
43+
3qfj_P
44+
3tfk_P
45+
3tjh_P
46+
3uts_P
47+
3utt_P
48+
3vxm_P
49+
3vxr_P
50+
3vxs_P
51+
3w0w_P
52+
4eup_P
53+
4g8g_P
54+
4jfd_P
55+
4jfe_P
56+
4jff_P
57+
4l3e_P
58+
4mji_P
59+
4mnq_P
60+
4mvb_P
61+
4mxq_P
62+
4prp_P
63+
4qok_P
64+
4qrp_P
65+
5brz_P
66+
5bs0_P
67+
5c07_P
68+
5c08_P
69+
5c09_P
70+
5c0a_P
71+
5c0b_P
72+
5c0c_P
73+
5d2n_P
74+
5e9d_P
75+
5eu6_P
76+
5euo_P
77+
5hhm_P
78+
5hho_P
79+
5hyj_P
80+
5ivx_P
81+
5jhd_P
82+
5m00_P
83+
5men_P
84+
5nme_P
85+
5til_P
86+
5wlg_P
87+
5xov_P
88+
5yxn_P
89+
6TRo_P
90+
6amu_P
91+
6bj3_P
92+
6dkp_P
93+
6eqb_P
94+
6g9q_P
95+
6l9l_P
96+
6mtm_P
97+
6q3s_P
98+
6rp9_P
99+
6tmo_P
100+
6vma_P
101+
7jwj_P
102+
7n1f_P
103+
7rm4_P

predict.sh

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -40,30 +40,47 @@ if [ $# -eq 0 ]; then
4040
help 1
4141
fi
4242

43+
############################
echo "initialize db"
############################
# Build one combined FASTA per chain type (M, P, A, B) at
# ${CWD}/data/tcr_pmhc_db_<chain>.fa. Each file is the concatenation of every
# "*_<chain>.fasta" found under ${db_dir}/fasta, with empty lines dropped.
db_dir=${CWD}/data/tcr_pmhc_db

for c in "M" "P" "A" "B"; do
  # Check the file that is actually written below. The earlier check tested
  # ${data_dir}_${c}.fa; data_dir is not assigned in the visible script, so the
  # guard looked at an unrelated path and the db was rebuilt on every run.
  # -s (exists and non-empty) also rebuilds after a run that left an empty file.
  if [ ! -s "${db_dir}_${c}.fa" ]; then
    find "${db_dir}/fasta" -name "*_${c}.fasta" -exec awk '$0!=""{print $0}' {} \; > "${db_dir}_${c}.fa"
  fi
done
53+
54+
4355
csv_file=$*
4456

45-
# convert csv to fasta files
57+
############################
58+
echo "convert csv to fasta files"
59+
############################
4660
python ${CWD}/main.py csv_to_fasta \
4761
--target_uri "${output_dir}${output_params}" \
4862
--pid_prefix tcr_pmhc_test_ \
4963
--default_y=1.0 \
5064
--verbose \
5165
${csv_file}
5266

53-
# make chain.idx
67+
############################
echo "make chain.idx"
############################
# Pipeline: take column 2 of mapping.idx_all, rewrite each id so that its last
# "_"-separated field is split off by a space ("<prefix> <last>"), sort with
# temp files in the current directory, then run scripts/collapse.awk over the
# sorted stream and store the result as chain.idx_all.
cut -f2 ${output_dir}/mapping.idx_all \
  | awk -F _ '{printf("%s",$1);for (i=2;i<NF;++i) printf("_%s", $i); printf(" %s\n", $NF);}' \
  | sort -T . \
  | awk -f ${CWD}/scripts/collapse.awk > ${output_dir}/chain.idx_all
5975

60-
# filter out ones that has only one chain
61-
# 1. load dict a (in test dataset) from attr.idx_all
62-
# 2. filter out those that:
63-
# i. has no peptide
64-
# ii. only have peptide & MHC and in dict a
65-
# iii.has only one chain
66-
#
76+
############################
77+
echo "filter out ones that has only one chain"
78+
echo " 1. load dict a (in test dataset) from attr.idx_all"
79+
echo " 2. filter out those that:"
80+
echo " i. has no peptide"
81+
echo " ii. only have peptide & MHC and in dict a"
82+
echo " iii.has only one chain"
83+
############################
6784
cat ${output_dir}/chain.idx_all | \
6885
awk -v attr_idx=${output_dir}/attr.idx_all 'BEGIN{
6986
while(getline<attr_idx) {
@@ -84,7 +101,9 @@ cat ${output_dir}/chain.idx_all | \
84101
print $0;
85102
}' > ${output_dir}/chain.idx_all_blacklist
86103

87-
# make attr.idx for test fold_i
104+
############################
105+
echo "make attr.idx"
106+
############################
88107
cat ${output_dir}/attr.idx_all | \
89108
awk -v blacklist=${output_dir}/chain.idx_all_blacklist 'BEGIN{
90109
a["xxxxxxxx"] = 1;
@@ -102,7 +121,9 @@ python ${CWD}/main.py attr_update_weight_and_task \
102121
--weight 1.0 \
103122
data/tcr_pmhc_db/attr.idx >> ${output_dir}/attr.idx
104123

105-
# build the dataset (test data included) mapping.idx and chain.idx
124+
############################
125+
echo "build the dataset: mapping.idx and chain.idx"
126+
############################
106127
cat ${CWD}/data/tcr_pmhc_db/mapping.idx ${output_dir}/mapping.idx_all > ${output_dir}/mapping.idx
107128
cat ${output_dir}/mapping.idx | \
108129
cut -f2 | \
@@ -111,7 +132,9 @@ cat ${output_dir}/mapping.idx | \
111132
awk -f ${CWD}/scripts/collapse.awk > ${output_dir}/chain.idx
112133

113134

114-
# build fasta for each chain
135+
############################
136+
echo "build fasta for each chain"
137+
############################
115138
for c in "A" "B" "P" "M"; do
116139
python ${CWD}/main.py fasta_extract \
117140
--target_uri ${output_dir} \
@@ -123,14 +146,18 @@ for c in "A" "B" "P" "M"; do
123146
fi
124147
done
125148

126-
# align A B M with jackhmmer
149+
############################
echo "align chains A, B and M with jackhmmer"
############################
# For each of chains A, B and M: write the list of that chain's database FASTA
# files to ${output_dir}/tcr_pmhc_db_<chain>, then feed the list on stdin to
# bin/mapred twice -- first with run_jackhmmer.sh, then with run_pipeline.sh.
# NOTE(review): presumably mapred runs the -m command per input line with
# "-c 10" as the concurrency -- confirm against bin/mapred.
for c in "A" "B" "M"; do
  fasta_list=${output_dir}/tcr_pmhc_db_${c}
  find ${CWD}/data/tcr_pmhc_db/fasta -name "*_${c}.fasta" > ${fasta_list}
  ${CWD}/bin/mapred -m "uniref90_db=${output_dir}/tcr_pmhc_${c}.fa mgnify_db=${CWD}/data/tcr_pmhc_db_${c}.fa sh ${CWD}/scripts/run_jackhmmer.sh -o ${output_dir}/a3m" -c 10 < ${fasta_list}
  ${CWD}/bin/mapred -m "PIPELINE_UNIREF_MAX_HITS=1000000 PIPELINE_MGNIFY_MAX_HITS=1000000 PIPELINE_DEDUPLICATE=0 sh ${CWD}/scripts/run_pipeline.sh -o ${output_dir}/a3m" -c 10 < ${fasta_list}
done
132157

133-
# align P with equal length
158+
############################
159+
echo "align chain P with equal length"
160+
############################
134161
python ${CWD}/main.py peptide_align \
135162
--output_dir ${output_dir}/a3m \
136163
--target_db ${output_dir}/tcr_pmhc_P.fa \
@@ -144,19 +171,23 @@ for c in "P"; do
144171
done
145172

146173
# filter a3m with threshold=t
174+
############################
echo "filter a3m (MHC): align_ratio>=${mhc_align_ratio_threshold}"
############################
# Remove any ${output_dir}/var left over from a previous run before it is
# recreated. ${output_dir:?} makes the shell abort with an error when
# output_dir is unset or empty, so this can never expand to "rm -rf /var";
# "--" and the quotes keep unusual path names from being parsed as options
# or split into several arguments.
if [ -d "${output_dir}/var" ]; then
  rm -rf -- "${output_dir:?}/var"
fi
150180

151-
echo "filter a3m (MHC): align_ratio>=${mhc_align_ratio_threshold}"
152181
cp -r ${output_dir}/a3m ${output_dir}/var
153182
python ${CWD}/main.py a3m_filter \
154183
--output_dir ${output_dir}/var \
155184
--aligned_ratio_threshold ${mhc_align_ratio_threshold} \
156185
--trim_gap \
157186
${CWD}/data/tcr_pmhc_db/fasta/*_M.fasta
158187

188+
############################
159189
echo "predict ${csv_file}"
190+
############################
160191
python main.py predict \
161192
${model_args} \
162193
--output_dir ${output_dir}/pred \

profold2

Submodule profold2 updated 69 files

0 commit comments

Comments
 (0)