@@ -22,22 +22,24 @@ class EncodeData
22
22
{
23
23
public:
24
24
std::size_t bytes_read{0 };
25
- std::size_t remaining_bytes{0 };
26
25
std::size_t field{0 }; // current vcf field
27
26
std::size_t b{0 }; // begin index in buffer_in
28
27
std::size_t i{b}; // index in buffer_in
29
- std::size_t o{0 }; // output index
30
28
bool header_line{true }; // !< True iff in header line
31
29
bool no_previous_line{false }; // ! Set to skip using previous line
32
30
33
31
std::string prev_contig{};
34
- uint64_t prev_pos{0 };
32
+ int64_t prev_pos{0 };
35
33
uint32_t prev_num_unique_fields{};
34
+ std::vector<std::string> prev_unique_fields{};
35
+ std::vector<uint32_t > prev_field2uid{};
36
36
phmap::flat_hash_map<std::string, uint32_t > prev_map_to_unique_fields{};
37
37
38
38
std::string contig{};
39
- uint64_t pos{0 };
39
+ int64_t pos{0 };
40
40
uint32_t num_unique_fields{};
41
+ std::vector<std::string> unique_fields{};
42
+ std::vector<uint32_t > field2uid{};
41
43
phmap::flat_hash_map<std::string, uint32_t > map_to_unique_fields{};
42
44
43
45
inline void clear_line ()
@@ -49,51 +51,42 @@ class EncodeData
49
51
std::swap (prev_contig, contig);
50
52
prev_pos = pos;
51
53
prev_num_unique_fields = num_unique_fields;
54
+ std::swap (prev_unique_fields, unique_fields);
55
+ std::swap (prev_field2uid, field2uid);
52
56
std::swap (prev_map_to_unique_fields, map_to_unique_fields);
53
57
}
54
58
55
59
contig.resize (0 );
56
60
pos = 0 ;
57
61
num_unique_fields = 0 ;
62
+ unique_fields.resize (0 );
63
+ field2uid.resize (0 );
58
64
map_to_unique_fields.clear (); // clear map every line
59
65
}
60
66
};
61
67
62
68
template <typename Tint, typename Tbuffer_out>
63
- inline void to_chars (Tint char_val, Tbuffer_out & buffer_out, EncodeData & ed )
69
+ inline void to_chars (Tint char_val, Tbuffer_out & buffer_out)
64
70
{
65
71
while (char_val >= CHAR_SET_SIZE)
66
72
{
67
73
auto rem = char_val % CHAR_SET_SIZE;
68
74
char_val = char_val / CHAR_SET_SIZE;
69
- buffer_out[ed. o ++] = int_to_ascii (rem);
75
+ buffer_out. push_back ( int_to_ascii (rem) );
70
76
}
71
77
72
78
assert (char_val < CHAR_SET_SIZE);
73
- buffer_out[ed.o ++] = int_to_ascii (char_val);
74
- }
75
-
76
- template <typename Tbuffer_out>
77
- inline void reserve_space (Tbuffer_out & buffer, long const input_size)
78
- {
79
- buffer.resize (std::max (ENC_BUFFER_SIZE, input_size));
80
- }
81
-
82
- // ! Specialization for array buffers
83
- template <>
84
- inline void reserve_space (Tarray_buf & /* buffer*/ , long const /* input_size*/ )
85
- {
86
- // NOP
79
+ buffer_out.push_back (int_to_ascii (char_val));
87
80
}
88
81
89
82
// ! Encodes an input buffer. Output is written in \a buffer_out.
90
83
template <typename Tbuffer_out, typename Tbuffer_in>
91
84
inline void encode_buffer (Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, EncodeData & ed)
92
85
{
93
- popvcf::reserve_space ( buffer_out, buffer_in. size () );
86
+ buffer_out. reserve (ENC_BUFFER_SIZE / 2 );
94
87
std::size_t constexpr N_FIELDS_SITE_DATA{9 }; // how many fields of the VCF contains site data
95
88
96
- while (ed.i < ( ed.bytes_read + ed. remaining_bytes ) )
89
+ while (ed.i < ed.bytes_read )
97
90
{
98
91
char const b_in = buffer_in[ed.i ];
99
92
@@ -131,8 +124,7 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
131
124
if (ed.header_line || ed.field < N_FIELDS_SITE_DATA)
132
125
{
133
126
++ed.i ; // adds '\t' or '\n'
134
- std::copy (&buffer_in[ed.b ], &buffer_in[ed.i ], &buffer_out[ed.o ]);
135
- ed.o += (ed.i - ed.b );
127
+ std::copy (&buffer_in[ed.b ], &buffer_in[ed.i ], std::back_inserter (buffer_out));
136
128
}
137
129
else
138
130
{
@@ -145,34 +137,69 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
145
137
std::forward_as_tuple (&buffer_in[ed.b ], ed.i - ed.b ),
146
138
std::forward_as_tuple (ed.num_unique_fields )));
147
139
140
+ long const field_idx = ed.field - N_FIELDS_SITE_DATA;
141
+ assert (field_idx == static_cast <long >(ed.field2uid .size ()));
142
+
148
143
if (insert_it.second == true )
149
144
{
145
+ ed.field2uid .push_back (ed.unique_fields .size ());
150
146
++ed.num_unique_fields ; // unique field
147
+ ed.unique_fields .emplace_back (&buffer_in[ed.b ], ed.i - ed.b );
151
148
152
- // check if it is in the previous line
153
- auto prev_find_it = ed.prev_map_to_unique_fields .find (insert_it.first ->first );
149
+ assert (ed.num_unique_fields == static_cast <long >(ed.unique_fields .size ()));
154
150
155
- if (prev_find_it == ed.prev_map_to_unique_fields .end ())
151
+ if (field_idx < static_cast <long >(ed.prev_field2uid .size ()) &&
152
+ ed.prev_unique_fields [ed.prev_field2uid [field_idx]] == ed.unique_fields [insert_it.first ->second ])
156
153
{
157
- /* Case 1: Field is unique in the current line and is not in the previous line . */
158
- ++ed. i ; // adds '\t' or '\n'
159
- std::copy (& buffer_in[ed.b ], &buffer_in[ed. i ], &buffer_out[ed. o ]); // just copy as is
160
- ed. o += ( ed.i - ed. b ) ;
154
+ /* Case 0: unique and same as above . */
155
+ buffer_out. push_back ( ' $ ' );
156
+ buffer_out. push_back ( buffer_in[ed.i ]);
157
+ ++ ed.i ;
161
158
}
162
159
else
163
160
{
164
- /* Case 2: Field is unique in the current line but identical to a field in the previous line. */
165
- buffer_out[ed.o ++] = ' %' ;
166
- popvcf::to_chars (prev_find_it->second , buffer_out, ed);
167
- buffer_out[ed.o ++] = buffer_in[ed.i ++]; // write '\t' or '\n'
161
+ // check if it is in the previous line
162
+ auto prev_find_it = ed.prev_map_to_unique_fields .find (insert_it.first ->first );
163
+
164
+ if (prev_find_it == ed.prev_map_to_unique_fields .end ())
165
+ {
166
+ /* Case 1: Field is unique in the current line and is not in the previous line. */
167
+ ++ed.i ; // adds '\t' or '\n'
168
+ std::copy (&buffer_in[ed.b ], &buffer_in[ed.i ], std::back_inserter (buffer_out)); // just copy as is
169
+ }
170
+ else
171
+ {
172
+ /* Case 2: Field is unique in the current line but identical to a field in the previous line. */
173
+ buffer_out.push_back (' %' );
174
+ popvcf::to_chars (prev_find_it->second , buffer_out);
175
+ buffer_out.push_back (buffer_in[ed.i ]); // write '\t' or '\n'
176
+ ++ed.i ;
177
+ }
168
178
}
169
179
}
170
180
else
171
181
{
172
- /* Case 3: Field is a duplicate in the current line. */
173
- popvcf::to_chars (insert_it.first ->second , buffer_out, ed);
174
- buffer_out[ed.o ++] = buffer_in[ed.i ++]; // write '\t' or '\n'
182
+ ed.field2uid .push_back (insert_it.first ->second );
183
+
184
+ if (field_idx < static_cast <long >(ed.prev_field2uid .size ()) &&
185
+ ed.prev_unique_fields [ed.prev_field2uid [field_idx]] == ed.unique_fields [insert_it.first ->second ])
186
+ {
187
+ /* Case 3: Field is not unique and same as the field above. */
188
+ buffer_out.push_back (' &' );
189
+ buffer_out.push_back (buffer_in[ed.i ]);
190
+ ++ed.i ;
191
+ }
192
+ else
193
+ {
194
+ /* Case 4: Field is a duplicate in the current line. */
195
+ popvcf::to_chars (insert_it.first ->second , buffer_out);
196
+ buffer_out.push_back (buffer_in[ed.i ]); // write '\t' or '\n'
197
+ ++ed.i ;
198
+ }
175
199
}
200
+
201
+ assert ((field_idx + 1 ) == static_cast <long >(ed.field2uid .size ()));
202
+ assert (ed.field2uid [0 ] == 0 );
176
203
}
177
204
178
205
assert (b_in == buffer_in[ed.i - 1 ]); // i should have been already incremented here
@@ -186,15 +213,10 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
186
213
} // ends inner loop
187
214
188
215
// copy the remaining data to the beginning of the input buffer
189
- ed.remaining_bytes = ed.i - ed.b ;
190
-
191
- if (ed.b > 0 )
192
- {
193
- std::copy (&buffer_in[ed.b ], &buffer_in[ed.b + ed.remaining_bytes ], &buffer_in[0 ]);
194
- ed.b = 0 ;
195
- }
196
-
197
- ed.i = ed.remaining_bytes ;
216
+ std::copy (&buffer_in[ed.b ], &buffer_in[ed.i ], &buffer_in[0 ]);
217
+ ed.i = ed.i - ed.b ;
218
+ ed.b = 0 ;
219
+ ed.bytes_read = ed.i ;
198
220
}
199
221
200
222
// ! Encode a gzipped file and write to stdout
0 commit comments