Skip to content

Commit 8ef2db2

Browse files
committed
bumps version, writes popvcf.gz when using popvcf encoding
1 parent e8f0f87 commit 8ef2db2

File tree

11 files changed

+216
-125
lines changed

11 files changed

+216
-125
lines changed

.github/workflows/cancel.yaml

Lines changed: 0 additions & 17 deletions
This file was deleted.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ example_test.vcf
1010
*.log
1111
.nfs*
1212
*.bak
13+
*.git*.sh
1314

1415
# ggtags
1516
/GPATH

CMakeLists.txt

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ include(CheckIncludeFileCXX)
77
# The version number
88
set (graphtyper_VERSION_MAJOR 2)
99
set (graphtyper_VERSION_MINOR 7)
10-
set (graphtyper_VERSION_PATCH 3)
10+
set (graphtyper_VERSION_PATCH 4)
1111

1212
# Graphtyper's headers
1313
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
@@ -302,6 +302,15 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/catch/single_include/)
302302
enable_testing(true)
303303
add_subdirectory(test)
304304

305+
############################################################################
306+
# ARCHIVE
307+
############################################################################
308+
add_custom_target(archive
309+
COMMAND sh -c "bash .git-archive-all.sh --format tar.gz --prefix graphtyper-v${graphtyper_VERSION}/ ${CMAKE_CURRENT_BINARY_DIR}/graphtyper-v${graphtyper_VERSION}.tar.gz"
310+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
311+
COMMENT "Generating an archive which includes submodules."
312+
VERBATIM)
313+
305314
#############################################################################
306315
## CLANG-FORMAT
307316
#############################################################################

include/graphtyper/utilities/bgzf_stream.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class BGZF_stream
2525
public:
2626
std::ostringstream ss;
2727
std::string buffer_in;
28-
std::string buffer_out;
28+
std::vector<char> buffer_out;
2929

3030
BGZF_stream() = default;
3131
~BGZF_stream(); // custom
@@ -70,12 +70,11 @@ inline void BGZF_stream::flush()
7070
if (Options::const_instance()->is_on_final_output && Options::const_instance()->encoding == 'p')
7171
{
7272
assert(buffer_in.size() >= ed.i);
73-
ed.bytes_read = buffer_in.size();
7473
popvcf::encode_buffer(buffer_out, buffer_in, ed);
7574

7675
if (fp == nullptr)
7776
{
78-
std::cout << buffer_out; // Write uncompressed to stdout
77+
std::copy(buffer_out.begin(), buffer_out.end(), std::ostream_iterator<char>(std::cout));
7978
}
8079
else if (buffer_out.size() > 0)
8180
{
@@ -94,7 +93,6 @@ inline void BGZF_stream::flush()
9493
}
9594
}
9695

97-
buffer_in.resize(ed.i);
9896
buffer_out.resize(0);
9997
}
10098
else

include/graphtyper/utilities/logging.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,4 +107,14 @@ void print_log(log_severity const severity, args_t &&... args)
107107
*log_singleton->sink << '\n';
108108
}
109109

110+
#ifdef NDEBUG // Release build
111+
# define print_debug(...) ((void)0)
112+
#else // not NDEBUG (=> debug build)
113+
# define print_debug(...) print_log(gyper::log_severity::debug, __VA_ARGS__)
114+
#endif // NDEBUG
115+
116+
#define print_info(...) print_log(gyper::log_severity::info, __VA_ARGS__)
117+
#define print_warning(...) print_log(gyper::log_severity::warning, __VA_ARGS__)
118+
#define print_error(...) print_log(gyper::log_severity::error, __VA_ARGS__)
119+
110120
} // namespace gyper

include/graphtyper/utilities/options.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class Options
3737
bool uncompressed_sample_names{false};
3838
char encoding{'v'}; // v VCF, p popVCF
3939
bool is_on_final_output{false}; // Set as true before writing out final output
40+
int bgzf_compression_level{9};
4041

4142
/****
4243
* FILTERING OPTIONS

include/popvcf/encode.hpp

Lines changed: 72 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -12,81 +12,83 @@
1212

1313
namespace popvcf
1414
{
15-
//! Buffer size when encoding
16-
long constexpr ENC_BUFFER_SIZE{4 * 65536};
17-
18-
//! Data type of an array buffer
19-
using Tarray_buf = std::array<char, ENC_BUFFER_SIZE>; //!< Buffer type
20-
2115
class EncodeData
2216
{
2317
public:
24-
std::size_t bytes_read{0};
25-
std::size_t field{0}; // current vcf field
26-
std::size_t b{0}; // begin index in buffer_in
27-
std::size_t i{b}; // index in buffer_in
28-
bool header_line{true}; //!< True iff in header line
29-
bool no_previous_line{false}; //! Set to skip using previous line
30-
31-
std::string prev_contig{};
32-
int64_t prev_pos{0};
33-
uint32_t prev_num_unique_fields{};
18+
std::size_t field{0}; //!< current vcf field.
19+
std::size_t in_size{0}; //!< Size of input buffer.
20+
std::size_t b{0}; //!< begin index in buffer_in
21+
std::size_t i{b}; //!< index in buffer_in
22+
bool header_line{true}; //!< True iff in header line
23+
24+
/* Data fields from previous line. */
3425
std::vector<std::string> prev_unique_fields{};
3526
std::vector<uint32_t> prev_field2uid{};
3627
phmap::flat_hash_map<std::string, uint32_t> prev_map_to_unique_fields{};
3728

29+
/* Data fields from current line. */
3830
std::string contig{};
3931
int64_t pos{0};
40-
uint32_t num_unique_fields{};
32+
int32_t n_alt{-1};
4133
std::vector<std::string> unique_fields{};
4234
std::vector<uint32_t> field2uid{};
4335
phmap::flat_hash_map<std::string, uint32_t> map_to_unique_fields{};
4436

45-
inline void clear_line()
46-
{
47-
field = 0; // reset field index
37+
/* Data fields for the next line. */
38+
std::string next_contig{};
39+
int64_t next_pos{0};
4840

49-
if (not no_previous_line)
41+
inline void clear_line(std::string && next_contig, int64_t next_pos, int32_t next_n_alt)
42+
{
43+
if (next_contig != contig || (next_pos / 10000) != (pos / 10000))
44+
{
45+
/// Previous line is not available, clear values
46+
prev_unique_fields.resize(0);
47+
prev_field2uid.resize(0);
48+
prev_map_to_unique_fields.clear();
49+
}
50+
else if (next_n_alt == n_alt)
5051
{
51-
std::swap(prev_contig, contig);
52-
prev_pos = pos;
53-
prev_num_unique_fields = num_unique_fields;
52+
/// Only swap out from this line if we have the same amount of alts
5453
std::swap(prev_unique_fields, unique_fields);
5554
std::swap(prev_field2uid, field2uid);
5655
std::swap(prev_map_to_unique_fields, map_to_unique_fields);
5756
}
5857

59-
contig.resize(0);
60-
pos = 0;
61-
num_unique_fields = 0;
58+
/// Clear data from this line for the next
59+
contig = std::move(next_contig);
60+
pos = next_pos;
61+
n_alt = next_n_alt;
6262
unique_fields.resize(0);
6363
field2uid.resize(0);
64-
map_to_unique_fields.clear(); // clear map every line
64+
map_to_unique_fields.clear();
6565
}
6666
};
6767

68-
template <typename Tint, typename Tbuffer_out>
69-
inline void to_chars(Tint char_val, Tbuffer_out & buffer_out)
68+
template <typename Tbuffer_in>
69+
inline void set_input_size(Tbuffer_in & buffer_in, EncodeData & ed)
7070
{
71-
while (char_val >= CHAR_SET_SIZE)
72-
{
73-
auto rem = char_val % CHAR_SET_SIZE;
74-
char_val = char_val / CHAR_SET_SIZE;
75-
buffer_out.push_back(int_to_ascii(rem));
76-
}
71+
ed.in_size = buffer_in.size();
72+
}
7773

78-
assert(char_val < CHAR_SET_SIZE);
79-
buffer_out.push_back(int_to_ascii(char_val));
74+
template <>
75+
inline void set_input_size(Tenc_array_buf & /*buffer_in*/, EncodeData & /*ed*/)
76+
{
77+
// Do nothing.
78+
// NOTE: ed.in_size must be set prior to calling encode_buffer with array buffers
8079
}
8180

8281
//! Encodes an input buffer. Output is written in \a buffer_out.
8382
template <typename Tbuffer_out, typename Tbuffer_in>
8483
inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, EncodeData & ed)
8584
{
86-
buffer_out.reserve(ENC_BUFFER_SIZE / 2);
85+
set_input_size(buffer_in, ed);
86+
buffer_out.reserve(ENC_BUFFER_SIZE);
8787
std::size_t constexpr N_FIELDS_SITE_DATA{9}; // how many fields of the VCF contains site data
88+
std::string next_contig{};
89+
int64_t next_pos{0};
8890

89-
while (ed.i < ed.bytes_read)
91+
while (ed.i < ed.in_size)
9092
{
9193
char const b_in = buffer_in[ed.i];
9294

@@ -96,35 +98,31 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
9698
continue; // we are in a vcf field
9799
}
98100

99-
if (ed.field == 0)
101+
if (ed.field == 0) /*CHROM field*/
100102
{
101103
// check if in header line and store contig
102-
if (buffer_in[ed.b] == '#')
103-
{
104-
ed.header_line = true;
105-
}
106-
else
107-
{
108-
ed.header_line = false;
109-
ed.contig = std::string(&buffer_in[ed.b], ed.i - ed.b);
110-
}
104+
ed.header_line = buffer_in[ed.b] == '#'; // check if in header line
105+
106+
if (not ed.header_line)
107+
next_contig.assign(&buffer_in[ed.b], ed.i - ed.b);
111108
}
112-
else if (ed.header_line == false && ed.field == 1)
109+
else if (not ed.header_line)
113110
{
114-
std::from_chars(&buffer_in[ed.b], &buffer_in[ed.i], ed.pos);
115-
116-
if (ed.contig != ed.prev_contig || (ed.pos / 10000) != (ed.prev_pos / 10000))
111+
if (ed.field == 1) /*POS field*/
112+
{
113+
std::from_chars(&buffer_in[ed.b], &buffer_in[ed.i], next_pos);
114+
}
115+
else if (ed.field == 4) /*ALT field*/
117116
{
118-
// previous line is not available
119-
ed.prev_num_unique_fields = 0;
120-
ed.prev_map_to_unique_fields.clear();
117+
int32_t next_n_alt = std::count(&buffer_in[ed.b], &buffer_in[ed.i], ',');
118+
ed.clear_line(std::move(next_contig), next_pos, next_n_alt);
121119
}
122120
}
123121

124122
if (ed.header_line || ed.field < N_FIELDS_SITE_DATA)
125123
{
126-
++ed.i; // adds '\t' or '\n'
127-
std::copy(&buffer_in[ed.b], &buffer_in[ed.i], std::back_inserter(buffer_out));
124+
++ed.i; // adds '\t' or '\n' and then insert the field to the output buffer
125+
buffer_out.insert(buffer_out.end(), &buffer_in[ed.b], &buffer_in[ed.i]);
128126
}
129127
else
130128
{
@@ -135,25 +133,25 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
135133
auto insert_it = ed.map_to_unique_fields.insert(
136134
std::pair<std::string, uint32_t>(std::piecewise_construct,
137135
std::forward_as_tuple(&buffer_in[ed.b], ed.i - ed.b),
138-
std::forward_as_tuple(ed.num_unique_fields)));
136+
std::forward_as_tuple(ed.unique_fields.size())));
139137

140138
long const field_idx = ed.field - N_FIELDS_SITE_DATA;
141139
assert(field_idx == static_cast<long>(ed.field2uid.size()));
142140

143141
if (insert_it.second == true)
144142
{
145143
ed.field2uid.push_back(ed.unique_fields.size());
146-
++ed.num_unique_fields; // unique field
147144
ed.unique_fields.emplace_back(&buffer_in[ed.b], ed.i - ed.b);
148145

149-
assert(ed.num_unique_fields == static_cast<long>(ed.unique_fields.size()));
150-
151146
if (field_idx < static_cast<long>(ed.prev_field2uid.size()) &&
152147
ed.prev_unique_fields[ed.prev_field2uid[field_idx]] == ed.unique_fields[insert_it.first->second])
153148
{
154149
/* Case 0: unique and same as above. */
155150
buffer_out.push_back('$');
156-
buffer_out.push_back(buffer_in[ed.i]);
151+
152+
if (b_in == '\n') /* never skip newline */
153+
buffer_out.push_back('\n');
154+
157155
++ed.i;
158156
}
159157
else
@@ -164,8 +162,8 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
164162
if (prev_find_it == ed.prev_map_to_unique_fields.end())
165163
{
166164
/* Case 1: Field is unique in the current line and is not in the previous line. */
167-
++ed.i; // adds '\t' or '\n'
168-
std::copy(&buffer_in[ed.b], &buffer_in[ed.i], std::back_inserter(buffer_out)); // just copy as is
165+
++ed.i; // adds '\t' or '\n'
166+
buffer_out.insert(buffer_out.end(), &buffer_in[ed.b], &buffer_in[ed.i]);
169167
}
170168
else
171169
{
@@ -186,7 +184,10 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
186184
{
187185
/* Case 3: Field is not unique and same as the field above. */
188186
buffer_out.push_back('&');
189-
buffer_out.push_back(buffer_in[ed.i]);
187+
188+
if (b_in == '\n') /* never skip newline */
189+
buffer_out.push_back('\n');
190+
190191
++ed.i;
191192
}
192193
else
@@ -207,7 +208,7 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
207208

208209
// check if we need to clear line or increment field
209210
if (b_in == '\n')
210-
ed.clear_line();
211+
ed.field = 0; // reset field index
211212
else
212213
++ed.field;
213214
} // ends inner loop
@@ -216,7 +217,8 @@ inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, Enco
216217
std::copy(&buffer_in[ed.b], &buffer_in[ed.i], &buffer_in[0]);
217218
ed.i = ed.i - ed.b;
218219
ed.b = 0;
219-
ed.bytes_read = ed.i;
220+
ed.in_size = ed.i;
221+
resize_input_buffer(buffer_in, ed.i);
220222
}
221223

222224
//! Encode a gzipped file and write to stdout
@@ -225,7 +227,6 @@ void encode_file(std::string const & input_fn,
225227
std::string const & output_fn,
226228
std::string const & output_mode,
227229
bool const is_bgzf_output,
228-
int const compression_threads,
229-
bool const no_previous_line);
230+
int const compression_threads);
230231

231232
} // namespace popvcf

0 commit comments

Comments
 (0)