Skip to content

Commit d8d0167

Browse files
authored
Version 2.5.1 (#38)
* Bugfix in some rare sv graph construction * Bugfix in somecases of VCF merging in * Made failed copy of tabix index a warning in genotype_sv instead of error
1 parent 9d9dfe6 commit d8d0167

File tree

10 files changed

+167
-67
lines changed

10 files changed

+167
-67
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ include(ExternalProject)
66
# The version number
77
set (graphtyper_VERSION_MAJOR 2)
88
set (graphtyper_VERSION_MINOR 5)
9-
set (graphtyper_VERSION_PATCH 0)
9+
set (graphtyper_VERSION_PATCH 1)
1010
set(STATIC_DIR "" CACHE STRING "If set, GraphTyper will be built as a static binary using libraries from the given STATIC_DIR.")
1111

1212
# Get the current working branch

include/graphtyper/graph/graph.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,9 +172,10 @@ class Graph
172172
/**********************
173173
* GRAPH MODIFICATION *
174174
**********************/
175-
bool add_reference(unsigned end_pos,
175+
void add_reference(unsigned end_pos,
176176
unsigned const & num_var,
177-
std::vector<char> const & reference_sequence);
177+
std::vector<char> const & reference_sequence
178+
);
178179

179180
void add_variants(VarRecord && record);
180181

include/graphtyper/typer/vcf_operations.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ void vcf_merge_and_break(std::vector<std::string> const & vcfs,
2121
std::string const & output,
2222
std::string const & region,
2323
bool const FILTER_ZERO_QUAL,
24-
bool const force_no_variant_overlap);
24+
bool const force_no_variant_overlap,
25+
bool const force_no_break_down);
2526

2627
void vcf_update_info(std::string const & vcf, std::string const & output);
2728

src/graph/graph.cpp

Lines changed: 63 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,15 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
8787
var_records.end()
8888
);
8989

90+
for (long v = 0; v < static_cast<long>(var_records.size()); ++v)
91+
{
92+
if (var_records[v].pos >= genomic_region.end)
93+
{
94+
var_records.resize(v);
95+
break;
96+
}
97+
}
98+
9099
if (Options::instance()->add_all_variants)
91100
{
92101
BOOST_LOG_TRIVIAL(debug) << "[" << __HERE__ << "] Constructing graph of "
@@ -157,7 +166,7 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
157166
}
158167
else
159168
{
160-
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Constructing graph of "
169+
BOOST_LOG_TRIVIAL(debug) << "[" << __HERE__ << "] Constructing graph of "
161170
<< var_records.size()
162171
<< " variants.";
163172

@@ -199,6 +208,46 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
199208
continue;
200209
}
201210

211+
std::vector<char> suffix = var_records[i].get_common_suffix();
212+
213+
//if (var_records[i + 1].pos >= var_records[i].pos + var_records[i].ref.size() - suffix.size())
214+
//{
215+
// long const suffix_to_remove = var_records[i].pos + var_records[i].ref.size() - var_records[i + 1].pos;
216+
//
217+
// // Check if it is possible to remove enough suffix so we can safely join the two records
218+
// if (suffix_to_remove <= 0 || suffix_to_remove > static_cast<long>(suffix.size()))
219+
// {
220+
// var_records[i + 1].merge_one_path(std::move(var_records[i]));
221+
// }
222+
// else
223+
// {
224+
// // Erase from previous record so that it ends where the next record starts
225+
// assert(static_cast<long>(var_records[i].ref.size()) > suffix_to_remove);
226+
// var_records[i].ref.erase(var_records[i].ref.end() - suffix_to_remove, var_records[i].ref.end());
227+
//
228+
// for (auto & alt : var_records[i].alts)
229+
// {
230+
// assert(static_cast<long>(alt.size()) > suffix_to_remove);
231+
// alt.erase(alt.end() - suffix_to_remove, alt.end());
232+
// }
233+
//
234+
// assert(var_records[i + 1].pos == var_records[i].pos + var_records[i].ref.size());
235+
//
236+
// // Merge the two records
237+
// long const suffix_to_add = suffix_to_remove - static_cast<long>(var_records[i + 1].ref.size());
238+
//
239+
// var_records[i + 1].merge(std::move(var_records[i]));
240+
//
241+
// if (suffix_to_add > 0)
242+
// {
243+
// var_records[i + 1].ref.insert(var_records[i + 1].ref.end(), suffix.end() - suffix_to_add, suffix.end());
244+
//
245+
// for (auto & alt : var_records[i + 1].alts)
246+
// alt.insert(alt.end(), suffix.end() - suffix_to_add, suffix.end());
247+
// }
248+
// }
249+
//}
250+
//else
202251
{
203252
// In a few extreme scenarios we cannot merge only one path here.
204253
//BOOST_LOG_TRIVIAL(fatal) << "i : " << var_records[i].to_string();
@@ -217,10 +266,6 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
217266
++i;
218267
} // while
219268
}
220-
221-
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Done merging overlapping variant records. Now I have "
222-
<< var_records.size()
223-
<< " variants.";
224269
}
225270

226271
// Erase alternatives which are identical to the reference sequence
@@ -252,7 +297,6 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
252297

253298
if (suffix.size() > 0)
254299
{
255-
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Removing commong suffix.";
256300
assert(suffix.size() < var_record.ref.size());
257301
var_record.ref.erase(var_record.ref.begin() + (var_record.ref.size() - suffix.size()), var_record.ref.end());
258302
assert(var_record.ref.size() > 0);
@@ -265,33 +309,22 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
265309
}
266310
}
267311

268-
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Adding reference and variants to the graph.";
269-
bool is_continue{true};
270-
271-
for (long i = 0; i < static_cast<long>(var_records.size()); ++i)
312+
// Add reference and variants to the graph
313+
for (unsigned i = 0; i < var_records.size(); ++i)
272314
{
273-
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " i = " << i << ", pos = " << var_records[i].pos;
274-
is_continue = add_reference(var_records[i].pos,
275-
static_cast<unsigned>(var_records[i].alts.size()) + 1u,
276-
reference_sequence); // force in the first iteration
315+
add_reference(var_records[i].pos,
316+
static_cast<unsigned>(var_records[i].alts.size()) + 1u,
317+
reference_sequence);
277318

278-
if (is_continue)
279-
{
280-
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " adding variant";
281-
add_variants(std::move(var_records[i]));
282-
}
319+
add_variants(std::move(var_records[i]));
283320
}
284321

285-
if (is_continue)
286-
{
287-
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Adding final reference sequence behind the last variant.";
288-
add_reference(static_cast<uint32_t>(reference_sequence.size()) + genomic_region.begin, 0, reference_sequence);
289-
}
322+
// Add final reference sequence behind the last variant
323+
add_reference(static_cast<uint32_t>(reference_sequence.size()) + genomic_region.begin, 0, reference_sequence);
290324

291325
// If we chose to use absolute positions we need to change all labels
292326
if (use_absolute_positions)
293327
{
294-
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Updating labels.";
295328
uint32_t const offset = genomic_region.get_absolute_position(1);
296329
unsigned r = 0;
297330
assert(r < ref_nodes.size());
@@ -572,10 +605,8 @@ Graph::add_variants(VarRecord && record)
572605
}
573606

574607

575-
bool
576-
Graph::add_reference(unsigned end_pos,
577-
unsigned const & num_var,
578-
std::vector<char> const & reference_sequence)
608+
void
609+
Graph::add_reference(unsigned end_pos, unsigned const & num_var, std::vector<char> const & reference_sequence)
579610
{
580611
if (end_pos > reference_sequence.size() + genomic_region.begin)
581612
{
@@ -591,13 +622,13 @@ Graph::add_reference(unsigned end_pos,
591622
}
592623

593624
// Make sure end_pos is larger or equal to start_pos
594-
bool is_continue = start_pos <= end_pos;
625+
end_pos = std::max(start_pos, end_pos);
626+
595627
assert(start_pos >= genomic_region.begin);
596628

597629
auto start_it = (start_pos - genomic_region.begin) < reference_sequence.size() ?
598630
reference_sequence.begin() + (start_pos - genomic_region.begin) :
599631
reference_sequence.end();
600-
601632
auto end_it = (end_pos - genomic_region.begin) < reference_sequence.size() ?
602633
reference_sequence.begin() + (end_pos - genomic_region.begin) :
603634
reference_sequence.end();
@@ -607,11 +638,10 @@ Graph::add_reference(unsigned end_pos,
607638
// Create a vector of indexes
608639
std::vector<TNodeIndex> var_indexes(num_var);
609640

610-
for (long i = 0; i < static_cast<long>(num_var); ++i)
641+
for (unsigned i = 0; i < num_var; ++i)
611642
var_indexes[i] = i + var_nodes.size();
612643

613644
ref_nodes.push_back(RefNode(Label(start_pos, std::move(current_dna), 0), std::move(var_indexes)));
614-
return is_continue;
615645
}
616646

617647

src/typer/vcf.cpp

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1666,17 +1666,32 @@ save_vcf(Vcf const & vcf, std::string const & filename)
16661666
void
16671667
load_vcf(Vcf & vcf, std::string const & filename, long n_batch)
16681668
{
1669-
std::string batch_filename = filename + "_" + std::to_string(n_batch);
1670-
std::ifstream ifs(batch_filename.c_str(), std::ios::binary);
1669+
std::string const VCF_GZ = ".vcf.gz";
1670+
1671+
if (filename.size() > VCF_GZ.size() &&
1672+
std::equal(filename.rbegin(),
1673+
filename.rbegin() + VCF_GZ.size(),
1674+
VCF_GZ.rbegin()))
16711675

1672-
if (!ifs.is_open())
16731676
{
1674-
BOOST_LOG_TRIVIAL(error) << "Could not open VCF file " << batch_filename;
1675-
std::exit(1);
1677+
vcf.open(READ_MODE, filename);
1678+
vcf.read();
16761679
}
1680+
else
1681+
{
1682+
std::string batch_filename = filename + "_" + std::to_string(n_batch);
1683+
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Loading variants from " << batch_filename;
1684+
std::ifstream ifs(batch_filename.c_str(), std::ios::binary);
16771685

1678-
boost::archive::binary_iarchive ia(ifs);
1679-
ia >> vcf;
1686+
if (!ifs.is_open())
1687+
{
1688+
BOOST_LOG_TRIVIAL(error) << __HERE__ << " Could not open file " << batch_filename;
1689+
std::exit(1);
1690+
}
1691+
1692+
boost::archive::binary_iarchive ia(ifs);
1693+
ia >> vcf;
1694+
}
16801695
}
16811696

16821697

@@ -1685,6 +1700,7 @@ append_vcf(Vcf & vcf, std::string const & filename, long n_batch)
16851700
{
16861701
Vcf new_vcf;
16871702
std::string batch_filename = filename + "_" + std::to_string(n_batch);
1703+
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Appending variants from " << batch_filename;
16881704
std::ifstream ifs(batch_filename.c_str(), std::ios::binary);
16891705

16901706
if (!ifs.is_open())

src/typer/vcf_operations.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,8 @@ vcf_merge_and_break(std::vector<std::string> const & vcfs,
288288
std::string const & output,
289289
std::string const & region,
290290
bool const FILTER_ZERO_QUAL,
291-
bool const force_no_variant_overlapping)
291+
bool const force_no_variant_overlapping,
292+
bool const force_no_break_down)
292293
{
293294
if (vcfs.size() == 0)
294295
return;
@@ -323,6 +324,7 @@ vcf_merge_and_break(std::vector<std::string> const & vcfs,
323324
for (long i = 1; i < static_cast<long>(vcfs.size()); ++i)
324325
{
325326
auto const & vcf_fn = vcfs[i];
327+
assert((i - 1) < static_cast<long>(next_vcfs.size()));
326328
gyper::Vcf & next_vcf = next_vcfs[i - 1];
327329
load_vcf(next_vcf, vcf_fn, 0);
328330

@@ -423,7 +425,7 @@ vcf_merge_and_break(std::vector<std::string> const & vcfs,
423425
}
424426

425427
// Trigger read next batch
426-
if ((v - v_next) == static_cast<long>(next_vcfs[0].variants.size()))
428+
if (next_vcfs.size() > 0 && (v - v_next) == static_cast<long>(next_vcfs[0].variants.size()))
427429
{
428430
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Updating next_vcfs";
429431

@@ -530,10 +532,15 @@ vcf_merge_and_break(std::vector<std::string> const & vcfs,
530532
force_no_variant_overlapping};
531533

532534
bool const is_all_biallelic{copts.is_all_biallelic};
533-
std::vector<Variant> new_variants = break_down_variant(std::move(var),
534-
reach,
535-
is_no_variant_overlapping,
536-
is_all_biallelic);
535+
std::vector<Variant> new_variants;
536+
537+
if (force_no_break_down)
538+
new_variants.push_back(std::move(var));
539+
else
540+
new_variants = break_down_variant(std::move(var),
541+
reach,
542+
is_no_variant_overlapping,
543+
is_all_biallelic);
537544
assert(new_variants.size() > 0);
538545

539546
for (auto & new_var : new_variants)

src/utilities/genotype.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,8 @@ genotype_only_with_a_vcf(std::string const & ref_path,
324324
//for (auto & path : paths)
325325
// path += "_calls.vcf.gz";
326326

327-
//> FILTER_ZERO_QUAL, force_no_variant_overlapping
328-
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false);
327+
//> FILTER_ZERO_QUAL, force_no_variant_overlapping, force_no_break_down
328+
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false, false);
329329

330330
// free memory
331331
graph = Graph();
@@ -652,12 +652,17 @@ genotype(std::string ref_path,
652652
// path += "_calls.vcf.gz";
653653

654654
//> FILTER_ZERO_QUAL, force_no_variant_overlapping
655-
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false);
655+
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false, false);
656656

657657
if (copts.normal_and_no_variant_overlapping)
658658
{
659659
//> FILTER_ZERO_QUAL, force_no_variant_overlapping
660-
vcf_merge_and_break(paths, tmp + "/graphtyper.no_variant_overlapping.vcf.gz", region.to_string(), true, true);
660+
vcf_merge_and_break(paths,
661+
tmp + "/graphtyper.no_variant_overlapping.vcf.gz",
662+
region.to_string(),
663+
true,
664+
true,
665+
false);
661666
}
662667
}
663668

src/utilities/genotype_camou.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ genotype_camou(std::string const & interval_fn,
256256
// path += "_calls.vcf.gz";
257257

258258
//> FILTER_ZERO_QUAL, force_no_variant_overlapping
259-
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false);
259+
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false, false);
260260
}
261261

262262
auto copy_camou_vcf_to_system =

src/utilities/genotype_sv.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,12 @@ genotype_sv(std::string ref_path,
151151

152152
// VCF merge
153153
{
154-
//// Append _calls.vcf.gz
154+
// Append _calls.vcf.gz
155155
//for (auto & path : paths)
156156
// path += "_calls.vcf.gz";
157157

158158
//> FILTER_ZERO_QUAL, force_no_variant_overlapping
159-
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false);
159+
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false, true);
160160
}
161161
}
162162

@@ -176,8 +176,17 @@ genotype_sv(std::string ref_path,
176176

177177
if (ret != 0)
178178
{
179-
BOOST_LOG_TRIVIAL(error) << "This command failed '" << ss_cmd.str() << "'";
180-
std::exit(ret);
179+
if (extension.size() == 0)
180+
{
181+
// Error if copying final VCF
182+
BOOST_LOG_TRIVIAL(error) << "This command failed '" << ss_cmd.str() << "'";
183+
std::exit(ret);
184+
}
185+
else
186+
{
187+
// Otherwise a warning
188+
BOOST_LOG_TRIVIAL(warning) << "This command failed '" << ss_cmd.str() << "'";
189+
}
181190
}
182191
};
183192

0 commit comments

Comments
 (0)