12
12
13
13
namespace popvcf
14
14
{
15
- // ! Buffer size when encoding
16
- long constexpr ENC_BUFFER_SIZE{4 * 65536 };
17
-
18
- // ! Data type of an array buffer
19
- using Tarray_buf = std::array<char , ENC_BUFFER_SIZE>; // !< Buffer type
20
-
21
15
//! Parsing state that encode_buffer() reads and updates through its EncodeData & argument:
//! the scan position inside the input buffer plus the per-line tables of unique sample fields
//! for the current line and for the line kept as "previous".
class EncodeData
22
16
{
23
17
public:
24
- std::size_t bytes_read{0 };
25
- std::size_t field{0 }; // current vcf field
26
- std::size_t b{0 }; // begin index in buffer_in
27
- std::size_t i{b}; // index in buffer_in
28
- bool header_line{true }; // !< True iff in header line
29
- bool no_previous_line{false }; // ! Set to skip using previous line
30
-
31
- std::string prev_contig{};
32
- int64_t prev_pos{0 };
33
- uint32_t prev_num_unique_fields{};
18
+ std::size_t field{0 }; // !< Index of the current VCF field within the line.
19
+ std::size_t in_size{0 }; // !< Size of input buffer.
20
+ std::size_t b{0 }; // !< Begin index of the current field in buffer_in.
21
+ std::size_t i{b}; // !< Current scan index in buffer_in.
22
+ bool header_line{true }; // !< True iff in header line
23
+
24
+ /* Data fields from previous line. */
34
25
std::vector<std::string> prev_unique_fields{};
35
26
std::vector<uint32_t > prev_field2uid{};
36
27
phmap::flat_hash_map<std::string, uint32_t > prev_map_to_unique_fields{};
37
28
29
+ /* Data fields from current line. */
38
30
std::string contig{};
39
31
int64_t pos{0 };
40
- uint32_t num_unique_fields{ };
32
+ int32_t n_alt{- 1 }; // !< Value last passed to clear_line() as next_n_alt; -1 until clear_line() is first called.
41
33
std::vector<std::string> unique_fields{};
42
34
std::vector<uint32_t > field2uid{};
43
35
phmap::flat_hash_map<std::string, uint32_t > map_to_unique_fields{};
44
36
45
- inline void clear_line ()
46
- {
47
- field = 0 ; // reset field index
37
+ /* Data fields for the next line. */
// NOTE(review): the parameters of clear_line() below shadow these two members, and nothing
// inside this class reads or writes them -- confirm whether the caller is meant to keep its
// pending contig/position here so they survive across input buffers.
38
+ std::string next_contig{};
39
+ int64_t next_pos{ 0 };
48
40
49
- if (not no_previous_line)
41
//! Rotates the per-line tables when a new line starts.
//!
//! \param next_contig Contig of the new line; moved into \a contig.
//! \param next_pos    Position of the new line; stored in \a pos.
//! \param next_n_alt  Alt count of the new line; stored in \a n_alt.
//!
//! Three cases, compared against the stored contig/pos/n_alt:
//!  - different contig or different (pos / 10000) bin: the prev_* tables are emptied;
//!  - same contig and bin, same n_alt: the current tables become the prev_* tables;
//!  - same contig and bin, different n_alt: the prev_* tables are left as they are.
//! In every case the current-line tables are emptied afterwards.
+ inline void clear_line (std::string && next_contig, int64_t next_pos, int32_t next_n_alt)
42
+ {
43
+ if (next_contig != contig || (next_pos / 10000 ) != (pos / 10000 ))
44
+ {
45
+ // / Different contig or 10000-position bin: previous line is not available, clear values
46
+ prev_unique_fields.resize (0 );
47
+ prev_field2uid.resize (0 );
48
+ prev_map_to_unique_fields.clear ();
49
+ }
50
+ else if (next_n_alt == n_alt)
50
51
{
51
- std::swap (prev_contig, contig);
52
- prev_pos = pos;
53
- prev_num_unique_fields = num_unique_fields;
52
+ // / Only swap out from this line if we have the same number of alts; otherwise prev_* is kept unchanged
54
53
std::swap (prev_unique_fields, unique_fields);
55
54
std::swap (prev_field2uid, field2uid);
56
55
std::swap (prev_map_to_unique_fields, map_to_unique_fields);
57
56
}
58
57
59
- contig.resize (0 );
60
- pos = 0 ;
61
- num_unique_fields = 0 ;
58
+ // / Store the new line's site data and empty the current-line tables
59
+ contig = std::move (next_contig);
60
+ pos = next_pos;
61
+ n_alt = next_n_alt;
62
62
unique_fields.resize (0 );
63
63
field2uid.resize (0 );
64
- map_to_unique_fields.clear (); // clear map every line
64
+ map_to_unique_fields.clear ();
65
65
}
66
66
};
67
67
68
- template <typename Tint, typename Tbuffer_out >
69
- inline void to_chars (Tint char_val, Tbuffer_out & buffer_out )
68
//! Records the size of the input buffer in \a ed.in_size (generic case: buffer_in.size()).
+ template <typename Tbuffer_in >
69
+ inline void set_input_size (Tbuffer_in & buffer_in, EncodeData & ed )
70
70
{
71
- while (char_val >= CHAR_SET_SIZE)
72
- {
73
- auto rem = char_val % CHAR_SET_SIZE;
74
- char_val = char_val / CHAR_SET_SIZE;
75
- buffer_out.push_back (int_to_ascii (rem));
76
- }
71
+ ed.in_size = buffer_in.size ();
72
+ }
77
73
78
- assert (char_val < CHAR_SET_SIZE);
79
- buffer_out.push_back (int_to_ascii (char_val));
74
//! Specialization for Tenc_array_buf: leaves \a ed.in_size untouched.
+ template <>
75
+ inline void set_input_size (Tenc_array_buf & /* buffer_in*/ , EncodeData & /* ed*/ )
76
+ {
77
+ // Do nothing.
78
+ // NOTE: ed.in_size must be set by the caller prior to calling encode_buffer with an array buffer
// (presumably because the array's size() is its fixed capacity, not the number of bytes read -- TODO confirm).
80
79
}
81
80
82
81
// ! Encodes an input buffer. Output is written in \a buffer_out.
83
82
template <typename Tbuffer_out, typename Tbuffer_in>
84
83
inline void encode_buffer (Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, EncodeData & ed)
85
84
{
86
- buffer_out.reserve (ENC_BUFFER_SIZE / 2 );
85
+ set_input_size (buffer_in, ed);
86
+ buffer_out.reserve (ENC_BUFFER_SIZE);
87
87
std::size_t constexpr N_FIELDS_SITE_DATA{9 }; // how many fields of the VCF contains site data
88
+ std::string next_contig{};
89
+ int64_t next_pos{0 };
88
90
89
- while (ed.i < ed.bytes_read )
91
+ while (ed.i < ed.in_size )
90
92
{
91
93
char const b_in = buffer_in[ed.i ];
92
94
@@ -96,35 +98,31 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
96
98
continue ; // we are in a vcf field
97
99
}
98
100
99
- if (ed.field == 0 )
101
+ if (ed.field == 0 ) /* CHROM field */
100
102
{
101
103
// check if in header line and store contig
102
- if (buffer_in[ed.b ] == ' #' )
103
- {
104
- ed.header_line = true ;
105
- }
106
- else
107
- {
108
- ed.header_line = false ;
109
- ed.contig = std::string (&buffer_in[ed.b ], ed.i - ed.b );
110
- }
104
+ ed.header_line = buffer_in[ed.b ] == ' #' ; // check if in header line
105
+
106
+ if (not ed.header_line )
107
+ next_contig.assign (&buffer_in[ed.b ], ed.i - ed.b );
111
108
}
112
- else if (ed. header_line == false && ed.field == 1 )
109
+ else if (not ed.header_line )
113
110
{
114
- std::from_chars (&buffer_in[ed.b ], &buffer_in[ed.i ], ed.pos );
115
-
116
- if (ed.contig != ed.prev_contig || (ed.pos / 10000 ) != (ed.prev_pos / 10000 ))
111
+ if (ed.field == 1 ) /* POS field*/
112
+ {
113
+ std::from_chars (&buffer_in[ed.b ], &buffer_in[ed.i ], next_pos);
114
+ }
115
+ else if (ed.field == 4 ) /* ALT field*/
117
116
{
118
- // previous line is not available
119
- ed.prev_num_unique_fields = 0 ;
120
- ed.prev_map_to_unique_fields .clear ();
117
+ int32_t next_n_alt = std::count (&buffer_in[ed.b ], &buffer_in[ed.i ], ' ,' );
118
+ ed.clear_line (std::move (next_contig), next_pos, next_n_alt);
121
119
}
122
120
}
123
121
124
122
if (ed.header_line || ed.field < N_FIELDS_SITE_DATA)
125
123
{
126
- ++ed.i ; // adds '\t' or '\n'
127
- std::copy ( &buffer_in[ed.b ], &buffer_in[ed.i ], std::back_inserter (buffer_out) );
124
+ ++ed.i ; // adds '\t' or '\n' and then insert the field to the output buffer
125
+ buffer_out. insert (buffer_out. end (), &buffer_in[ed.b ], &buffer_in[ed.i ]);
128
126
}
129
127
else
130
128
{
@@ -135,25 +133,25 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
135
133
auto insert_it = ed.map_to_unique_fields .insert (
136
134
std::pair<std::string, uint32_t >(std::piecewise_construct,
137
135
std::forward_as_tuple (&buffer_in[ed.b ], ed.i - ed.b ),
138
- std::forward_as_tuple (ed.num_unique_fields )));
136
+ std::forward_as_tuple (ed.unique_fields . size () )));
139
137
140
138
long const field_idx = ed.field - N_FIELDS_SITE_DATA;
141
139
assert (field_idx == static_cast <long >(ed.field2uid .size ()));
142
140
143
141
if (insert_it.second == true )
144
142
{
145
143
ed.field2uid .push_back (ed.unique_fields .size ());
146
- ++ed.num_unique_fields ; // unique field
147
144
ed.unique_fields .emplace_back (&buffer_in[ed.b ], ed.i - ed.b );
148
145
149
- assert (ed.num_unique_fields == static_cast <long >(ed.unique_fields .size ()));
150
-
151
146
if (field_idx < static_cast <long >(ed.prev_field2uid .size ()) &&
152
147
ed.prev_unique_fields [ed.prev_field2uid [field_idx]] == ed.unique_fields [insert_it.first ->second ])
153
148
{
154
149
/* Case 0: unique and same as above. */
155
150
buffer_out.push_back (' $' );
156
- buffer_out.push_back (buffer_in[ed.i ]);
151
+
152
+ if (b_in == ' \n ' ) /* never skip newline */
153
+ buffer_out.push_back (' \n ' );
154
+
157
155
++ed.i ;
158
156
}
159
157
else
@@ -164,8 +162,8 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
164
162
if (prev_find_it == ed.prev_map_to_unique_fields .end ())
165
163
{
166
164
/* Case 1: Field is unique in the current line and is not in the previous line. */
167
- ++ed.i ; // adds '\t' or '\n'
168
- std::copy ( &buffer_in[ed.b ], &buffer_in[ed.i ], std::back_inserter (buffer_out)); // just copy as is
165
+ ++ed.i ; // adds '\t' or '\n'
166
+ buffer_out. insert (buffer_out. end (), &buffer_in[ed.b ], &buffer_in[ed.i ]);
169
167
}
170
168
else
171
169
{
@@ -186,7 +184,10 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
186
184
{
187
185
/* Case 3: Field is not unique and same as the field above. */
188
186
buffer_out.push_back (' &' );
189
- buffer_out.push_back (buffer_in[ed.i ]);
187
+
188
+ if (b_in == ' \n ' ) /* never skip newline */
189
+ buffer_out.push_back (' \n ' );
190
+
190
191
++ed.i ;
191
192
}
192
193
else
@@ -207,7 +208,7 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
207
208
208
209
// check if we need to clear line or increment field
209
210
if (b_in == ' \n ' )
210
- ed.clear_line ();
211
+ ed.field = 0 ; // reset field index
211
212
else
212
213
++ed.field ;
213
214
} // ends inner loop
@@ -216,7 +217,8 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
216
217
std::copy (&buffer_in[ed.b ], &buffer_in[ed.i ], &buffer_in[0 ]);
217
218
ed.i = ed.i - ed.b ;
218
219
ed.b = 0 ;
219
- ed.bytes_read = ed.i ;
220
+ ed.in_size = ed.i ;
221
+ resize_input_buffer (buffer_in, ed.i );
220
222
}
221
223
222
224
// ! Encode a gzipped file and write to stdout
@@ -225,7 +227,6 @@ void encode_file(std::string const & input_fn,
225
227
std::string const & output_fn,
226
228
std::string const & output_mode,
227
229
bool const is_bgzf_output,
228
- int const compression_threads,
229
- bool const no_previous_line);
230
+ int const compression_threads);
230
231
231
232
} // namespace popvcf
0 commit comments