Skip to content

Commit 0295764

Browse files
committed
removed cerr debug line
1 parent 8669ddd commit 0295764

File tree

2 files changed

+74
-54
lines changed

2 files changed

+74
-54
lines changed

include/graphtyper/utilities/bgzf_stream.hpp

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ inline void BGZF_stream::flush()
6969

7070
if (Options::const_instance()->is_on_final_output && Options::const_instance()->encoding == 'p')
7171
{
72-
assert(buffer_in.size() >= ed.remaining_bytes);
73-
ed.bytes_read = buffer_in.size() - ed.remaining_bytes;
72+
assert(buffer_in.size() >= ed.i);
73+
ed.bytes_read = buffer_in.size();
7474
popvcf::encode_buffer(buffer_out, buffer_in, ed);
7575

7676
if (fp == nullptr)
@@ -79,24 +79,22 @@ inline void BGZF_stream::flush()
7979
}
8080
else if (buffer_out.size() > 0)
8181
{
82-
std::cerr << "Writing:'" << buffer_out << "'";
83-
int written_length = bgzf_write(fp, buffer_out.data(), ed.o);
82+
int written_length = bgzf_write(fp, buffer_out.data(), buffer_out.size());
8483

8584
if (written_length < 0)
8685
{
8786
std::cerr << "[bgzf_stream] ERROR: Writing to BGZF file failed. "
8887
<< "Exit code: " << written_length << " . No space left on device?" << std::endl;
8988
std::exit(1);
9089
}
91-
else if (written_length != static_cast<long>(ed.o))
90+
else if (written_length != static_cast<long>(buffer_out.size()))
9291
{
9392
std::cerr << "[bgzf] WARNING: Mismatch between size written and expected: " << written_length
9493
<< " != " << buffer_out.size();
9594
}
9695
}
9796

98-
ed.o = 0;
99-
buffer_in.resize(ed.remaining_bytes);
97+
buffer_in.resize(ed.i);
10098
buffer_out.resize(0);
10199
}
102100
else

include/popvcf/encode.hpp

Lines changed: 69 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,24 @@ class EncodeData
2222
{
2323
public:
2424
std::size_t bytes_read{0};
25-
std::size_t remaining_bytes{0};
2625
std::size_t field{0}; // current vcf field
2726
std::size_t b{0}; // begin index in buffer_in
2827
std::size_t i{b}; // index in buffer_in
29-
std::size_t o{0}; // output index
3028
bool header_line{true}; //!< True iff in header line
3129
bool no_previous_line{false}; //!< Set to skip using previous line
3230

3331
std::string prev_contig{};
34-
uint64_t prev_pos{0};
32+
int64_t prev_pos{0};
3533
uint32_t prev_num_unique_fields{};
34+
std::vector<std::string> prev_unique_fields{};
35+
std::vector<uint32_t> prev_field2uid{};
3636
phmap::flat_hash_map<std::string, uint32_t> prev_map_to_unique_fields{};
3737

3838
std::string contig{};
39-
uint64_t pos{0};
39+
int64_t pos{0};
4040
uint32_t num_unique_fields{};
41+
std::vector<std::string> unique_fields{};
42+
std::vector<uint32_t> field2uid{};
4143
phmap::flat_hash_map<std::string, uint32_t> map_to_unique_fields{};
4244

4345
inline void clear_line()
@@ -49,51 +51,42 @@ class EncodeData
4951
std::swap(prev_contig, contig);
5052
prev_pos = pos;
5153
prev_num_unique_fields = num_unique_fields;
54+
std::swap(prev_unique_fields, unique_fields);
55+
std::swap(prev_field2uid, field2uid);
5256
std::swap(prev_map_to_unique_fields, map_to_unique_fields);
5357
}
5458

5559
contig.resize(0);
5660
pos = 0;
5761
num_unique_fields = 0;
62+
unique_fields.resize(0);
63+
field2uid.resize(0);
5864
map_to_unique_fields.clear(); // clear map every line
5965
}
6066
};
6167

6268
template <typename Tint, typename Tbuffer_out>
63-
inline void to_chars(Tint char_val, Tbuffer_out & buffer_out, EncodeData & ed)
69+
inline void to_chars(Tint char_val, Tbuffer_out & buffer_out)
6470
{
6571
while (char_val >= CHAR_SET_SIZE)
6672
{
6773
auto rem = char_val % CHAR_SET_SIZE;
6874
char_val = char_val / CHAR_SET_SIZE;
69-
buffer_out[ed.o++] = int_to_ascii(rem);
75+
buffer_out.push_back(int_to_ascii(rem));
7076
}
7177

7278
assert(char_val < CHAR_SET_SIZE);
73-
buffer_out[ed.o++] = int_to_ascii(char_val);
74-
}
75-
76-
template <typename Tbuffer_out>
77-
inline void reserve_space(Tbuffer_out & buffer, long const input_size)
78-
{
79-
buffer.resize(std::max(ENC_BUFFER_SIZE, input_size));
80-
}
81-
82-
//! Specialization for array buffers
83-
template <>
84-
inline void reserve_space(Tarray_buf & /*buffer*/, long const /*input_size*/)
85-
{
86-
// NOP
79+
buffer_out.push_back(int_to_ascii(char_val));
8780
}
8881

8982
//! Encodes an input buffer. Output is written in \a buffer_out.
9083
template <typename Tbuffer_out, typename Tbuffer_in>
9184
inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, EncodeData & ed)
9285
{
93-
popvcf::reserve_space(buffer_out, buffer_in.size());
86+
buffer_out.reserve(ENC_BUFFER_SIZE / 2);
9487
std::size_t constexpr N_FIELDS_SITE_DATA{9}; // how many fields of the VCF contains site data
9588

96-
while (ed.i < (ed.bytes_read + ed.remaining_bytes))
89+
while (ed.i < ed.bytes_read)
9790
{
9891
char const b_in = buffer_in[ed.i];
9992

@@ -131,8 +124,7 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
131124
if (ed.header_line || ed.field < N_FIELDS_SITE_DATA)
132125
{
133126
++ed.i; // adds '\t' or '\n'
134-
std::copy(&buffer_in[ed.b], &buffer_in[ed.i], &buffer_out[ed.o]);
135-
ed.o += (ed.i - ed.b);
127+
std::copy(&buffer_in[ed.b], &buffer_in[ed.i], std::back_inserter(buffer_out));
136128
}
137129
else
138130
{
@@ -145,34 +137,69 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
145137
std::forward_as_tuple(&buffer_in[ed.b], ed.i - ed.b),
146138
std::forward_as_tuple(ed.num_unique_fields)));
147139

140+
long const field_idx = ed.field - N_FIELDS_SITE_DATA;
141+
assert(field_idx == static_cast<long>(ed.field2uid.size()));
142+
148143
if (insert_it.second == true)
149144
{
145+
ed.field2uid.push_back(ed.unique_fields.size());
150146
++ed.num_unique_fields; // unique field
147+
ed.unique_fields.emplace_back(&buffer_in[ed.b], ed.i - ed.b);
151148

152-
// check if it is in the previous line
153-
auto prev_find_it = ed.prev_map_to_unique_fields.find(insert_it.first->first);
149+
assert(ed.num_unique_fields == static_cast<long>(ed.unique_fields.size()));
154150

155-
if (prev_find_it == ed.prev_map_to_unique_fields.end())
151+
if (field_idx < static_cast<long>(ed.prev_field2uid.size()) &&
152+
ed.prev_unique_fields[ed.prev_field2uid[field_idx]] == ed.unique_fields[insert_it.first->second])
156153
{
157-
/* Case 1: Field is unique in the current line and is not in the previous line. */
158-
++ed.i; // adds '\t' or '\n'
159-
std::copy(&buffer_in[ed.b], &buffer_in[ed.i], &buffer_out[ed.o]); // just copy as is
160-
ed.o += (ed.i - ed.b);
154+
/* Case 0: Field is unique in the current line and identical to the field at the same index in the previous line. */
155+
buffer_out.push_back('$');
156+
buffer_out.push_back(buffer_in[ed.i]);
157+
++ed.i;
161158
}
162159
else
163160
{
164-
/* Case 2: Field is unique in the current line but identical to a field in the previous line. */
165-
buffer_out[ed.o++] = '%';
166-
popvcf::to_chars(prev_find_it->second, buffer_out, ed);
167-
buffer_out[ed.o++] = buffer_in[ed.i++]; // write '\t' or '\n'
161+
// check if it is in the previous line
162+
auto prev_find_it = ed.prev_map_to_unique_fields.find(insert_it.first->first);
163+
164+
if (prev_find_it == ed.prev_map_to_unique_fields.end())
165+
{
166+
/* Case 1: Field is unique in the current line and is not in the previous line. */
167+
++ed.i; // adds '\t' or '\n'
168+
std::copy(&buffer_in[ed.b], &buffer_in[ed.i], std::back_inserter(buffer_out)); // just copy as is
169+
}
170+
else
171+
{
172+
/* Case 2: Field is unique in the current line but identical to a field in the previous line. */
173+
buffer_out.push_back('%');
174+
popvcf::to_chars(prev_find_it->second, buffer_out);
175+
buffer_out.push_back(buffer_in[ed.i]); // write '\t' or '\n'
176+
++ed.i;
177+
}
168178
}
169179
}
170180
else
171181
{
172-
/* Case 3: Field is a duplicate in the current line. */
173-
popvcf::to_chars(insert_it.first->second, buffer_out, ed);
174-
buffer_out[ed.o++] = buffer_in[ed.i++]; // write '\t' or '\n'
182+
ed.field2uid.push_back(insert_it.first->second);
183+
184+
if (field_idx < static_cast<long>(ed.prev_field2uid.size()) &&
185+
ed.prev_unique_fields[ed.prev_field2uid[field_idx]] == ed.unique_fields[insert_it.first->second])
186+
{
187+
/* Case 3: Field is not unique and is the same as the field above. */
188+
buffer_out.push_back('&');
189+
buffer_out.push_back(buffer_in[ed.i]);
190+
++ed.i;
191+
}
192+
else
193+
{
194+
/* Case 4: Field is a duplicate in the current line. */
195+
popvcf::to_chars(insert_it.first->second, buffer_out);
196+
buffer_out.push_back(buffer_in[ed.i]); // write '\t' or '\n'
197+
++ed.i;
198+
}
175199
}
200+
201+
assert((field_idx + 1) == static_cast<long>(ed.field2uid.size()));
202+
assert(ed.field2uid[0] == 0);
176203
}
177204

178205
assert(b_in == buffer_in[ed.i - 1]); // i should have been already incremented here
@@ -186,15 +213,10 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
186213
} // ends inner loop
187214

188215
// copy the remaining data to the beginning of the input buffer
189-
ed.remaining_bytes = ed.i - ed.b;
190-
191-
if (ed.b > 0)
192-
{
193-
std::copy(&buffer_in[ed.b], &buffer_in[ed.b + ed.remaining_bytes], &buffer_in[0]);
194-
ed.b = 0;
195-
}
196-
197-
ed.i = ed.remaining_bytes;
216+
std::copy(&buffer_in[ed.b], &buffer_in[ed.i], &buffer_in[0]);
217+
ed.i = ed.i - ed.b;
218+
ed.b = 0;
219+
ed.bytes_read = ed.i;
198220
}
199221

200222
//! Encode a gzipped file and write to stdout

0 commit comments

Comments
 (0)