Skip to content

Commit b0755e4

Browse files
Clean up pairaln
1 parent e019185 commit b0755e4

File tree

2 files changed

+55
-60
lines changed

2 files changed

+55
-60
lines changed

src/commons/Parameters.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,6 @@ class Parameters {
228228
// pairaln mode
229229
static const int PAIRALN_MODE_ALL_PER_SPECIES = 0;
230230
static const int PAIRALN_MODE_COVER_ALL_CHAINS = 1;
231-
static const int PAIRALN_MODE_PROXIMITY = 2;
232231

233232
// pairaln filter
234233
static const int PAIRALN_FILTER_TOP_HIT = 0;

src/util/pairaln.cpp

Lines changed: 55 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -16,34 +16,39 @@
1616
#include <omp.h>
1717
#endif
1818

19+
#ifndef SIZE_T_MAX
20+
#define SIZE_T_MAX ((size_t) -1)
21+
#endif
22+
23+
1924
#define ABS_DIFF(a, b) ( ((a) > (b)) ? ((a) - (b)) : ((b) - (a)) )
2025

2126
class UniProtConverter {
2227
public:
23-
size_t to_structured_number(std::string &uniprot_id) const {
24-
if (uniprot_id.find('-') != std::string::npos) {
25-
uniprot_id = uniprot_id.substr(0, uniprot_id.find('-'));
28+
size_t toStructuredNumber(std::string &uniprotId) const {
29+
if (uniprotId.find('-') != std::string::npos) {
30+
uniprotId = uniprotId.substr(0, uniprotId.find('-'));
2631
}
2732

28-
if (uniprot_id.empty()) return 0;
33+
if (uniprotId.empty()) return 0;
2934

30-
const size_t len = uniprot_id.length();
31-
const char first_char = static_cast<char>(std::toupper(uniprot_id[0]));
35+
const size_t len = uniprotId.length();
36+
const char firstChar = static_cast<char>(std::toupper(uniprotId[0]));
3237

33-
if (len == 6 && (first_char == 'O' || first_char == 'P' || first_char == 'Q')) {
34-
return convertOpqPattern(uniprot_id);
38+
if (len == 6 && (firstChar == 'O' || firstChar == 'P' || firstChar == 'Q')) {
39+
return convertOpqPattern(uniprotId);
3540
}
3641

3742
if ((len == 6 || len == 10)) {
38-
return convertAnrzPattern(uniprot_id);
43+
return convertAnrzPattern(uniprotId);
3944
}
4045

41-
if (uniprot_id[0]=='U' && uniprot_id[1]=='P' && uniprot_id[2] =='I') {
42-
std::string hex_part(uniprot_id.substr(3));
46+
if (uniprotId[0] == 'U' && uniprotId[1] == 'P' && uniprotId[2] == 'I') {
47+
std::string hex_part(uniprotId.substr(3));
4348
return std::stoll(hex_part, nullptr, 16);
4449
}
4550

46-
return 0; // Invalid length or pattern
51+
return 0;
4752
}
4853

4954
private:
@@ -52,16 +57,18 @@ class UniProtConverter {
5257
size_t multiplier = 1;
5358

5459
for (int i = 5; i >= 0; --i) {
55-
const char current_char = static_cast<char>(std::toupper(id[i]));
56-
int char_val = -1;
60+
const char currentChar = static_cast<char>(std::toupper(id[i]));
61+
int charVal = -1;
5762
int radix = 0;
5863
switch (i) {
59-
case 0: char_val = getOpqValue(current_char); radix = 3; break;
60-
case 1: case 5: char_val = getDigitValue(current_char); radix = 10; break;
61-
case 2: case 3: case 4: char_val = getAlphanumValue(current_char); radix = 36; break;
64+
case 0: charVal = getOpqValue(currentChar); radix = 3; break;
65+
case 1: case 5: charVal = getDigitValue(currentChar); radix = 10; break;
66+
case 2: case 3: case 4: charVal = getAlphanumValue(currentChar); radix = 36; break;
6267
}
63-
if (char_val == -1) return 0;
64-
number += static_cast<size_t>(char_val) * multiplier;
68+
if (charVal == -1) {
69+
return 0;
70+
}
71+
number += static_cast<size_t>(charVal) * multiplier;
6572
multiplier *= radix;
6673
}
6774
return number;
@@ -73,30 +80,30 @@ class UniProtConverter {
7380
size_t len = id.length();
7481

7582
for (int i = len - 1; i >= 0; --i) {
76-
const char current_char = static_cast<char>(std::toupper(id[i]));
77-
int char_val = -1;
83+
const char currentChar = static_cast<char>(std::toupper(id[i]));
84+
int charVal = -1;
7885
int radix = 0;
7986

8087
// --- Logic using a switch statement ---
8188
switch (i) {
8289
case 0:
83-
char_val = getAnrzValue(current_char); radix = 23;
90+
charVal = getAnrzValue(currentChar); radix = 23;
8491
break;
8592
case 1: case 5: case 9:
86-
char_val = getDigitValue(current_char); radix = 10;
93+
charVal = getDigitValue(currentChar); radix = 10;
8794
break;
8895
case 2: case 6:
89-
char_val = getAlphaValue(current_char); radix = 26;
96+
charVal = getAlphaValue(currentChar); radix = 26;
9097
break;
9198
case 3: case 4: case 7: case 8:
92-
char_val = getAlphanumValue(current_char); radix = 36;
99+
charVal = getAlphanumValue(currentChar); radix = 36;
93100
break;
94101
default:
95102
return 0;
96103
}
97-
if (char_val == -1) return 0;
104+
if (charVal == -1) return 0;
98105

99-
number += static_cast<size_t>(char_val) * multiplier;
106+
number += static_cast<size_t>(charVal) * multiplier;
100107
multiplier *= radix;
101108
}
102109
return number;
@@ -127,48 +134,36 @@ struct CompareByTaxon {
127134
size_t findNearestPartner(
128135
const unsigned int taxon_id, const Matcher::result_t& query, const std::vector<Matcher::result_t>& results2)
129136
{
130-
// Use a typedef to simplify the long iterator type name, a common C++98 practice.
131137
typedef std::vector<Matcher::result_t>::const_iterator ResultConstIterator;
132-
133-
// 1. Find the sub-range for the given taxon in both vectors.
134138
std::pair<ResultConstIterator, ResultConstIterator> range;
135139
range = std::equal_range(results2.begin(), results2.end(), taxon_id, CompareByTaxon());
136140

137-
// If the taxon isn't in both lists, no pair is possible.
138141
if (range.first == range.second) {
139-
// Use std::make_pair instead of modern brace initialization.
140142
return SIZE_T_MAX;
141143
}
142144

143145
size_t bestIdx = SIZE_T_MAX;
144146
size_t min_dist = SIZE_T_MAX;
145147

146-
147-
// 2. For each result in the first taxon range...
148-
uint64_t query_uniprot_num = CompareUniProt::getUniProtNumber(query);
149-
150-
// 3. ...find the closest partner in the second taxon range using binary search.
148+
size_t query_uniprot_num = CompareUniProt::getUniProtNumber(query);
151149
ResultConstIterator it2 = std::lower_bound(range.first, range.second, query_uniprot_num, CompareUniProt());
152-
153-
// 4. Check the element at the found position.
154150
if (it2 != range.second) {
155-
uint64_t target_uniprot_num = CompareUniProt::getUniProtNumber(*it2);
156-
// Calculate absolute difference for unsigned integers without using std::abs on signed types.
157-
uint64_t dist = ABS_DIFF(target_uniprot_num, query_uniprot_num);
151+
size_t target_uniprot_num = CompareUniProt::getUniProtNumber(*it2);
152+
size_t dist = ABS_DIFF(target_uniprot_num, query_uniprot_num);
158153

159154
if (dist < min_dist) {
160155
min_dist = dist;
161156
bestIdx = std::distance(results2.begin(), it2);
162157
}
163158
}
164159

165-
// 5. Check the element just before the found position.
160+
// Check the element just before the found position.
166161
if (it2 != range.first) {
167162
ResultConstIterator prev_it2 = it2;
168163
--prev_it2; // Use pre-decrement to get the previous iterator.
169164

170-
uint64_t target_uniprot_num = CompareUniProt::getUniProtNumber(*prev_it2);
171-
uint64_t dist = ABS_DIFF(query_uniprot_num, target_uniprot_num);
165+
size_t target_uniprot_num = CompareUniProt::getUniProtNumber(*prev_it2);
166+
size_t dist = ABS_DIFF(query_uniprot_num, target_uniprot_num);
172167

173168
if (dist < min_dist) {
174169
bestIdx = std::distance(results2.begin(), prev_it2);
@@ -177,18 +172,15 @@ size_t findNearestPartner(
177172
return bestIdx;
178173
}
179174

180-
// need for sorting the results
181175
static bool compareByTaxId(const Matcher::result_t &first, const Matcher::result_t &second) {
182176
return (first.dbOrfStartPos < second.dbOrfStartPos);
183177
}
184178

185179
static bool compareByTaxIdAndUniProtNum(const Matcher::result_t &first, const Matcher::result_t &second) {
186-
// 1. Primary Sort Key: Compare the taxon ID.
187180
if (first.dbOrfStartPos != second.dbOrfStartPos) {
188181
return first.dbOrfStartPos < second.dbOrfStartPos;
189182
}
190183

191-
// 2. Secondary Sort Key: Compare the 64-bit UniProt number.
192184
if (first.queryOrfStartPos != second.queryOrfStartPos) {
193185
return first.queryOrfStartPos < second.queryOrfStartPos;
194186
}
@@ -212,12 +204,14 @@ int pairaln(int argc, const char **argv, const Command& command) {
212204
fileToIds[lookup[i].fileNumber].push_back(lookup[i].id);
213205
}
214206
IndexReader *targetHeaderReaderIdx = NULL;
215-
uint16_t extended = DBReader<unsigned int>::getExtendedDbtype(FileUtil::parseDbType(par.db3.c_str()));
216-
bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
217-
targetHeaderReaderIdx = new IndexReader(par.db2, par.threads,
218-
extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC ? IndexReader::SRC_HEADERS : IndexReader::HEADERS,
219-
(touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
220-
207+
if(par.pairfilter == Parameters::PAIRALN_FILTER_PROXIMITY) {
208+
uint16_t extended = DBReader<unsigned int>::getExtendedDbtype(FileUtil::parseDbType(par.db3.c_str()));
209+
bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
210+
targetHeaderReaderIdx = new IndexReader(par.db2, par.threads,
211+
extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC
212+
? IndexReader::SRC_HEADERS : IndexReader::HEADERS,
213+
(touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
214+
}
221215

222216
std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2);
223217
MappingReader* mapping = new MappingReader(db2NoIndexName);
@@ -315,11 +309,10 @@ int pairaln(int argc, const char **argv, const Command& command) {
315309
unsigned int taxon = mapping->lookup(resultPerId[i][resIdx].dbKey);
316310
// we don't want to introduce a new field, reuse existing unused field here
317311
resultPerId[i][resIdx].dbOrfStartPos = taxon;
318-
319312
size_t headerId = targetHeaderReaderIdx->sequenceReader->getId(resultPerId[i][resIdx].dbKey);
320313
char *headerData = targetHeaderReaderIdx->sequenceReader->getData(headerId, thread_idx);
321314
std::string targetAccession = Util::parseFastaHeader(headerData);
322-
size_t uniProtNumber = converter.to_structured_number(targetAccession);
315+
size_t uniProtNumber = converter.toStructuredNumber(targetAccession);
323316
resultPerId[i][resIdx].queryOrfStartPos = static_cast<int>(uniProtNumber >> 32);
324317
resultPerId[i][resIdx].queryOrfEndPos = static_cast<int>(uniProtNumber & 0xFFFFFFFF);
325318
}
@@ -417,7 +410,6 @@ int pairaln(int argc, const char **argv, const Command& command) {
417410

418411

419412
unsigned int prevTaxon = UINT_MAX;
420-
// iterate over taxonToPair
421413
size_t resIdxStart = 0;
422414
for (size_t taxonInList: taxonToPair) {
423415
bool taxonFound = false;
@@ -453,10 +445,14 @@ int pairaln(int argc, const char **argv, const Command& command) {
453445
}
454446
resultWriter.close();
455447
qdbr.close();
456-
// clean up
448+
// clean up
457449
delete mapping;
458-
delete targetHeaderReaderIdx;
450+
if(par.pairfilter == Parameters::PAIRALN_FILTER_PROXIMITY) {
451+
delete targetHeaderReaderIdx;
452+
}
459453
alnDbr.close();
460454
return EXIT_SUCCESS;
461455
}
462456

457+
#undef ABS_DIFF
458+
#undef SIZE_T_MAX

0 commit comments

Comments
 (0)