16
16
#include < omp.h>
17
17
#endif
18
18
19
+ #ifndef SIZE_T_MAX
20
+ #define SIZE_T_MAX ((size_t ) -1 )
21
+ #endif
22
+
23
+
19
24
#define ABS_DIFF (a, b ) ( ((a) > (b)) ? ((a) - (b)) : ((b) - (a)) )
20
25
21
26
class UniProtConverter {
22
27
public:
23
- size_t to_structured_number (std::string &uniprot_id ) const {
24
- if (uniprot_id .find (' -' ) != std::string::npos) {
25
- uniprot_id = uniprot_id .substr (0 , uniprot_id .find (' -' ));
28
+ size_t toStructuredNumber (std::string &uniprotId ) const {
29
+ if (uniprotId .find (' -' ) != std::string::npos) {
30
+ uniprotId = uniprotId .substr (0 , uniprotId .find (' -' ));
26
31
}
27
32
28
- if (uniprot_id .empty ()) return 0 ;
33
+ if (uniprotId .empty ()) return 0 ;
29
34
30
- const size_t len = uniprot_id .length ();
31
- const char first_char = static_cast <char >(std::toupper (uniprot_id [0 ]));
35
+ const size_t len = uniprotId .length ();
36
+ const char firstChar = static_cast <char >(std::toupper (uniprotId [0 ]));
32
37
33
- if (len == 6 && (first_char == ' O' || first_char == ' P' || first_char == ' Q' )) {
34
- return convertOpqPattern (uniprot_id );
38
+ if (len == 6 && (firstChar == ' O' || firstChar == ' P' || firstChar == ' Q' )) {
39
+ return convertOpqPattern (uniprotId );
35
40
}
36
41
37
42
if ((len == 6 || len == 10 )) {
38
- return convertAnrzPattern (uniprot_id );
43
+ return convertAnrzPattern (uniprotId );
39
44
}
40
45
41
- if (uniprot_id [0 ]== ' U' && uniprot_id [1 ]== ' P' && uniprot_id [2 ] ==' I' ) {
42
- std::string hex_part (uniprot_id .substr (3 ));
46
+ if (uniprotId [0 ] == ' U' && uniprotId [1 ] == ' P' && uniprotId [2 ] == ' I' ) {
47
+ std::string hex_part (uniprotId .substr (3 ));
43
48
return std::stoll (hex_part, nullptr , 16 );
44
49
}
45
50
46
- return 0 ; // Invalid length or pattern
51
+ return 0 ;
47
52
}
48
53
49
54
private:
@@ -52,16 +57,18 @@ class UniProtConverter {
52
57
size_t multiplier = 1 ;
53
58
54
59
for (int i = 5 ; i >= 0 ; --i) {
55
- const char current_char = static_cast <char >(std::toupper (id[i]));
56
- int char_val = -1 ;
60
+ const char currentChar = static_cast <char >(std::toupper (id[i]));
61
+ int charVal = -1 ;
57
62
int radix = 0 ;
58
63
switch (i) {
59
- case 0 : char_val = getOpqValue (current_char ); radix = 3 ; break ;
60
- case 1 : case 5 : char_val = getDigitValue (current_char ); radix = 10 ; break ;
61
- case 2 : case 3 : case 4 : char_val = getAlphanumValue (current_char ); radix = 36 ; break ;
64
+ case 0 : charVal = getOpqValue (currentChar ); radix = 3 ; break ;
65
+ case 1 : case 5 : charVal = getDigitValue (currentChar ); radix = 10 ; break ;
66
+ case 2 : case 3 : case 4 : charVal = getAlphanumValue (currentChar ); radix = 36 ; break ;
62
67
}
63
- if (char_val == -1 ) return 0 ;
64
- number += static_cast <size_t >(char_val) * multiplier;
68
+ if (charVal == -1 ) {
69
+ return 0 ;
70
+ }
71
+ number += static_cast <size_t >(charVal) * multiplier;
65
72
multiplier *= radix;
66
73
}
67
74
return number;
@@ -73,30 +80,30 @@ class UniProtConverter {
73
80
size_t len = id.length ();
74
81
75
82
for (int i = len - 1 ; i >= 0 ; --i) {
76
- const char current_char = static_cast <char >(std::toupper (id[i]));
77
- int char_val = -1 ;
83
+ const char currentChar = static_cast <char >(std::toupper (id[i]));
84
+ int charVal = -1 ;
78
85
int radix = 0 ;
79
86
80
87
// --- Logic using a switch statement ---
81
88
switch (i) {
82
89
case 0 :
83
- char_val = getAnrzValue (current_char ); radix = 23 ;
90
+ charVal = getAnrzValue (currentChar ); radix = 23 ;
84
91
break ;
85
92
case 1 : case 5 : case 9 :
86
- char_val = getDigitValue (current_char ); radix = 10 ;
93
+ charVal = getDigitValue (currentChar ); radix = 10 ;
87
94
break ;
88
95
case 2 : case 6 :
89
- char_val = getAlphaValue (current_char ); radix = 26 ;
96
+ charVal = getAlphaValue (currentChar ); radix = 26 ;
90
97
break ;
91
98
case 3 : case 4 : case 7 : case 8 :
92
- char_val = getAlphanumValue (current_char ); radix = 36 ;
99
+ charVal = getAlphanumValue (currentChar ); radix = 36 ;
93
100
break ;
94
101
default :
95
102
return 0 ;
96
103
}
97
- if (char_val == -1 ) return 0 ;
104
+ if (charVal == -1 ) return 0 ;
98
105
99
- number += static_cast <size_t >(char_val ) * multiplier;
106
+ number += static_cast <size_t >(charVal ) * multiplier;
100
107
multiplier *= radix;
101
108
}
102
109
return number;
@@ -127,48 +134,36 @@ struct CompareByTaxon {
127
134
size_t findNearestPartner (
128
135
const unsigned int taxon_id, const Matcher::result_t & query, const std::vector<Matcher::result_t >& results2)
129
136
{
130
- // Use a typedef to simplify the long iterator type name, a common C++98 practice.
131
137
typedef std::vector<Matcher::result_t >::const_iterator ResultConstIterator;
132
-
133
- // 1. Find the sub-range for the given taxon in both vectors.
134
138
std::pair<ResultConstIterator, ResultConstIterator> range;
135
139
range = std::equal_range (results2.begin (), results2.end (), taxon_id, CompareByTaxon ());
136
140
137
- // If the taxon isn't in both lists, no pair is possible.
138
141
if (range.first == range.second ) {
139
- // Use std::make_pair instead of modern brace initialization.
140
142
return SIZE_T_MAX;
141
143
}
142
144
143
145
size_t bestIdx = SIZE_T_MAX;
144
146
size_t min_dist = SIZE_T_MAX;
145
147
146
-
147
- // 2. For each result in the first taxon range...
148
- uint64_t query_uniprot_num = CompareUniProt::getUniProtNumber (query);
149
-
150
- // 3. ...find the closest partner in the second taxon range using binary search.
148
+ size_t query_uniprot_num = CompareUniProt::getUniProtNumber (query);
151
149
ResultConstIterator it2 = std::lower_bound (range.first , range.second , query_uniprot_num, CompareUniProt ());
152
-
153
- // 4. Check the element at the found position.
154
150
if (it2 != range.second ) {
155
- uint64_t target_uniprot_num = CompareUniProt::getUniProtNumber (*it2);
156
- // Calculate absolute difference for unsigned integers without using std::abs on signed types.
157
- uint64_t dist = ABS_DIFF (target_uniprot_num, query_uniprot_num);
151
+ size_t target_uniprot_num = CompareUniProt::getUniProtNumber (*it2);
152
+ size_t dist = ABS_DIFF (target_uniprot_num, query_uniprot_num);
158
153
159
154
if (dist < min_dist) {
160
155
min_dist = dist;
161
156
bestIdx = std::distance (results2.begin (), it2);
162
157
}
163
158
}
164
159
165
- // 5. Check the element just before the found position.
160
+ // Check the element just before the found position.
166
161
if (it2 != range.first ) {
167
162
ResultConstIterator prev_it2 = it2;
168
163
--prev_it2; // Use pre-decrement to get the previous iterator.
169
164
170
- uint64_t target_uniprot_num = CompareUniProt::getUniProtNumber (*prev_it2);
171
- uint64_t dist = ABS_DIFF (query_uniprot_num, target_uniprot_num);
165
+ size_t target_uniprot_num = CompareUniProt::getUniProtNumber (*prev_it2);
166
+ size_t dist = ABS_DIFF (query_uniprot_num, target_uniprot_num);
172
167
173
168
if (dist < min_dist) {
174
169
bestIdx = std::distance (results2.begin (), prev_it2);
@@ -177,18 +172,15 @@ size_t findNearestPartner(
177
172
return bestIdx;
178
173
}
179
174
180
- // need for sorting the results
181
175
static bool compareByTaxId (const Matcher::result_t &first, const Matcher::result_t &second) {
182
176
return (first.dbOrfStartPos < second.dbOrfStartPos );
183
177
}
184
178
185
179
static bool compareByTaxIdAndUniProtNum (const Matcher::result_t &first, const Matcher::result_t &second) {
186
- // 1. Primary Sort Key: Compare the taxon ID.
187
180
if (first.dbOrfStartPos != second.dbOrfStartPos ) {
188
181
return first.dbOrfStartPos < second.dbOrfStartPos ;
189
182
}
190
183
191
- // 2. Secondary Sort Key: Compare the 64-bit UniProt number.
192
184
if (first.queryOrfStartPos != second.queryOrfStartPos ) {
193
185
return first.queryOrfStartPos < second.queryOrfStartPos ;
194
186
}
@@ -212,12 +204,14 @@ int pairaln(int argc, const char **argv, const Command& command) {
212
204
fileToIds[lookup[i].fileNumber ].push_back (lookup[i].id );
213
205
}
214
206
IndexReader *targetHeaderReaderIdx = NULL ;
215
- uint16_t extended = DBReader<unsigned int >::getExtendedDbtype (FileUtil::parseDbType (par.db3 .c_str ()));
216
- bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
217
- targetHeaderReaderIdx = new IndexReader (par.db2 , par.threads ,
218
- extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC ? IndexReader::SRC_HEADERS : IndexReader::HEADERS,
219
- (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0 );
220
-
207
+ if (par.pairfilter == Parameters::PAIRALN_FILTER_PROXIMITY) {
208
+ uint16_t extended = DBReader<unsigned int >::getExtendedDbtype (FileUtil::parseDbType (par.db3 .c_str ()));
209
+ bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
210
+ targetHeaderReaderIdx = new IndexReader (par.db2 , par.threads ,
211
+ extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC
212
+ ? IndexReader::SRC_HEADERS : IndexReader::HEADERS,
213
+ (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0 );
214
+ }
221
215
222
216
std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex (par.db2 );
223
217
MappingReader* mapping = new MappingReader (db2NoIndexName);
@@ -315,11 +309,10 @@ int pairaln(int argc, const char **argv, const Command& command) {
315
309
unsigned int taxon = mapping->lookup (resultPerId[i][resIdx].dbKey );
316
310
// we don't want to introduce a new field, reuse existing unused field here
317
311
resultPerId[i][resIdx].dbOrfStartPos = taxon;
318
-
319
312
size_t headerId = targetHeaderReaderIdx->sequenceReader ->getId (resultPerId[i][resIdx].dbKey );
320
313
char *headerData = targetHeaderReaderIdx->sequenceReader ->getData (headerId, thread_idx);
321
314
std::string targetAccession = Util::parseFastaHeader (headerData);
322
- size_t uniProtNumber = converter.to_structured_number (targetAccession);
315
+ size_t uniProtNumber = converter.toStructuredNumber (targetAccession);
323
316
resultPerId[i][resIdx].queryOrfStartPos = static_cast <int >(uniProtNumber >> 32 );
324
317
resultPerId[i][resIdx].queryOrfEndPos = static_cast <int >(uniProtNumber & 0xFFFFFFFF );
325
318
}
@@ -417,7 +410,6 @@ int pairaln(int argc, const char **argv, const Command& command) {
417
410
418
411
419
412
unsigned int prevTaxon = UINT_MAX;
420
- // iterate over taxonToPair
421
413
size_t resIdxStart = 0 ;
422
414
for (size_t taxonInList: taxonToPair) {
423
415
bool taxonFound = false ;
@@ -453,10 +445,14 @@ int pairaln(int argc, const char **argv, const Command& command) {
453
445
}
454
446
resultWriter.close ();
455
447
qdbr.close ();
456
- // clean up
448
+ // clean up
457
449
delete mapping;
458
- delete targetHeaderReaderIdx;
450
+ if (par.pairfilter == Parameters::PAIRALN_FILTER_PROXIMITY) {
451
+ delete targetHeaderReaderIdx;
452
+ }
459
453
alnDbr.close ();
460
454
return EXIT_SUCCESS;
461
455
}
462
456
457
+ #undef ABS_DIFF
458
+ #undef SIZE_T_MAX
0 commit comments