Skip to content

Commit 9d9dfe6

Browse files
authored
Version 2.5 (#34)
* Added option for inputting custom BAM/CRAM index files in the genotype subcommand. * Added advanved option in the 'genotype' called '--normal_and_no_variant_overlapping' which will output both of those things in the VCF output. * Made copying tabix index a warning instead of error. Tweaked how merging of vcf files worked. Fixed a rare bug in graph construction. * Improved performance of creating bgzipped VCF. * Minor bugfix in failed regression test * Removed haplotype impurty filter as it seems to be causing some issues in large cohorts
1 parent a64262e commit 9d9dfe6

30 files changed

+807
-608
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ include(ExternalProject)
55

66
# The version number
77
set (graphtyper_VERSION_MAJOR 2)
8-
set (graphtyper_VERSION_MINOR 4)
8+
set (graphtyper_VERSION_MINOR 5)
99
set (graphtyper_VERSION_PATCH 0)
1010
set(STATIC_DIR "" CACHE STRING "If set, GraphTyper will be built as a static binary using libraries from the given STATIC_DIR.")
1111

include/graphtyper/graph/graph.hpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,10 +172,9 @@ class Graph
172172
/**********************
173173
* GRAPH MODIFICATION *
174174
**********************/
175-
void add_reference(unsigned end_pos,
175+
bool add_reference(unsigned end_pos,
176176
unsigned const & num_var,
177-
std::vector<char> const & reference_sequence
178-
);
177+
std::vector<char> const & reference_sequence);
179178

180179
void add_variants(VarRecord && record);
181180

include/graphtyper/graph/haplotype.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ class Haplotype
101101
/*********************
102102
* CLASS INFORMATION *
103103
*********************/
104-
std::vector<uint16_t> get_haplotype_calls(std::vector<double> & haplotype_impurity) const;
104+
std::vector<uint16_t> get_haplotype_calls() const;
105105
uint32_t get_genotype_num() const;
106106
bool has_too_many_genotypes() const;
107107
std::vector<uint32_t> get_genotype_ids() const;

include/graphtyper/graph/haplotype_calls.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#pragma once
2+
23
#include <iostream>
34
#include <vector>
45

@@ -7,6 +8,7 @@
78
#include <graphtyper/graph/haplotype.hpp>
89
#include <graphtyper/typer/vcf_writer.hpp>
910

11+
1012
namespace gyper
1113
{
1214

@@ -18,7 +20,6 @@ class HaplotypeCall
1820
std::vector<uint16_t> calls;
1921
std::vector<Genotype> gts;
2022
std::vector<std::vector<ReadStrand> > read_strand;
21-
std::vector<double> haplotype_impurity;
2223
long num_samples{0};
2324

2425
HaplotypeCall() = default;
@@ -58,6 +59,6 @@ class HaplotypeCalls
5859

5960

6061
void save_calls(HaplotypeCalls & calls, std::string const & filename);
61-
std::vector<HaplotypeCall> load_calls(std::string filename);
62+
std::vector<HaplotypeCall> load_calls(std::string const & filename);
6263

6364
}

include/graphtyper/typer/sample_call.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
#include <cstdint> // uint8_t, uint16_t
44
#include <vector> // std::vector
55

6+
#include <boost/serialization/access.hpp>
7+
8+
69
namespace gyper
710
{
811

@@ -12,6 +15,8 @@ class ReferenceDepth;
1215

1316
class SampleCall
1417
{
18+
friend class boost::serialization::access;
19+
1520
public:
1621
SampleCall() noexcept;
1722
SampleCall(std::vector<uint8_t> && phred,
@@ -36,6 +41,9 @@ class SampleCall
3641
uint8_t get_gq() const;
3742
int8_t check_filter(long gq) const;
3843

44+
private:
45+
template <class Archive>
46+
void serialize(Archive & ar, unsigned int version);
3947

4048
};
4149

include/graphtyper/typer/variant.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
#include <string> // std::string
55
#include <vector> // std::vector
66

7+
#include <boost/serialization/access.hpp>
8+
79
#include <graphtyper/graph/genotype.hpp> // gyper::Genotype
810
#include <graphtyper/graph/haplotype.hpp> // gyper::Haplotype
911
#include <graphtyper/typer/sample_call.hpp> // gyper::SampleCall
@@ -16,6 +18,8 @@ class VariantCandidate;
1618

1719
class Variant
1820
{
21+
friend class boost::serialization::access;
22+
1923
public:
2024
uint32_t abs_pos;
2125
std::vector<std::vector<char> > seqs;
@@ -71,6 +75,10 @@ class Variant
7175
bool operator==(Variant const & b) const;
7276
bool operator!=(Variant const & b) const;
7377
bool operator<(Variant const & b) const;
78+
79+
private:
80+
template <class Archive>
81+
void serialize(Archive & ar, unsigned int version);
7482
};
7583

7684
// Hash function for Variant

include/graphtyper/typer/vcf.hpp

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
#include <unordered_map> // std::unordered_map
66
#include <vector> // std::vector
77

8+
#include <boost/serialization/access.hpp>
9+
810
#include <graphtyper/constants.hpp>
911
#include <graphtyper/graph/genotype.hpp>
1012
#include <graphtyper/graph/haplotype.hpp>
@@ -32,6 +34,8 @@ enum VCF_FILE_MODE
3234

3335
class Vcf
3436
{
37+
friend class boost::serialization::access;
38+
3539
public:
3640
Vcf();
3741
explicit Vcf(VCF_FILE_MODE _filemode, std::string const & filename);
@@ -52,22 +56,21 @@ class Vcf
5256
void read_samples();
5357
bool read_record(bool SITES_ONLY = false);
5458
void read(bool SITES_ONLY = false); /** \brief Reads the VCF file. */
55-
void open_for_writing();
59+
void open_for_writing(long const n_threads = 1);
5660
void write_header();
5761

5862
void write_record(Variant const & var,
5963
std::string const & suffix = "",
60-
const bool FILTER_ZERO_QUAL = false
61-
);
64+
const bool FILTER_ZERO_QUAL = false);
6265

6366
void write_tbi_index() const;
6467
void write_segments();
65-
void write(std::string const & region = "."); /** \brief Writes the VCF file. */
68+
void write(std::string const & region = ".", long const n_threads = 1); /** \brief Writes the VCF file. */
69+
6670
void write_records(uint32_t region_begin,
6771
uint32_t region_end,
6872
bool FILTER_ZERO_QUAL,
69-
std::vector<Variant> const & vars
70-
);
73+
std::vector<Variant> const & vars);
7174

7275
void write_records(std::string const & region = ".", bool FILTER_ZERO_QUAL = false);
7376
void close_vcf_file();
@@ -83,23 +86,24 @@ class Vcf
8386

8487
void add_haplotypes_for_extraction(std::vector<HaplotypeCall> const & hap_calls, bool const is_splitting_vars);
8588

86-
// Modify data
87-
//void post_process_camou_variants(long const ploidy);
88-
89-
//void post_process_variants(bool NORMALIZE = true,
90-
// bool TRIM_SEQUENCES = true
91-
// );
92-
9389
VCF_FILE_MODE filemode;
9490
std::string filename;
9591
std::vector<std::string> sample_names;
9692
std::vector<Variant> variants;
9793
std::vector<Segment> segments;
94+
95+
private:
96+
template <class Archive>
97+
void serialize(Archive & ar, unsigned int version);
9898
};
9999

100100
std::vector<std::size_t> get_all_pos(std::string const & line, char const delim = '\t');
101101
std::string get_string_at_tab_index(std::string const & line,
102102
std::vector<std::size_t> const & tabs,
103103
int index);
104104

105+
void save_vcf(Vcf const & vcf, std::string const & filename);
106+
void load_vcf(Vcf & vcf, std::string const & filename, long n_batch);
107+
bool append_vcf(Vcf & vcf, std::string const & filename, long n_batch);
108+
105109
} // namespace gyper

include/graphtyper/typer/vcf_operations.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,11 @@ void vcf_concatenate(std::vector<std::string> const & vcfs,
1717

1818
void vcf_break_down(std::string const & vcf, std::string const & output, std::string const & region);
1919

20-
void vcf_merge_and_break(std::vector<std::string> & vcfs,
20+
void vcf_merge_and_break(std::vector<std::string> const & vcfs,
2121
std::string const & output,
2222
std::string const & region,
23-
bool const FILTER_ZERO_QUAL);
23+
bool const FILTER_ZERO_QUAL,
24+
bool const force_no_variant_overlap);
2425

2526
void vcf_update_info(std::string const & vcf, std::string const & output);
2627

include/graphtyper/utilities/bamshrink.hpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,20 +33,21 @@ namespace gyper
3333
{
3434

3535
void
36-
bamshrink(std::string const & chrom,
36+
bamshrink(std::string const chrom,
3737
int begin,
3838
int end,
39-
std::string const & path_in,
40-
std::string const & path_out,
39+
std::string const path_in,
40+
std::string const sam_index_in,
41+
std::string const path_out,
4142
double const avg_cov_by_readlen,
42-
std::string const & ref_fn);
43+
std::string const ref_fn);
4344

4445

4546
void
46-
bamshrink_multi(std::string const & interval_fn,
47-
std::string const & path_in,
48-
std::string const & path_out,
47+
bamshrink_multi(std::string const interval_fn,
48+
std::string const path_in,
49+
std::string const path_out,
4950
double const avg_cov_by_readlen,
50-
std::string const & ref_fn);
51+
std::string const ref_fn);
5152

5253
} // namespace gyper

include/graphtyper/utilities/bgzf_stream.hpp

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,45 +19,46 @@ class BGZF_stream
1919
{
2020
private:
2121
BGZF * fp = nullptr;
22-
std::ostringstream ss;
23-
2422

2523
public:
24+
std::ostringstream ss;
25+
2626
BGZF_stream() = default;
27-
BGZF_stream(std::string const & filename, std::string const & filemode);
28-
~BGZF_stream();
27+
~BGZF_stream(); // custom
28+
BGZF_stream(BGZF_stream const &) = delete;
29+
BGZF_stream(BGZF_stream &&) = delete;
30+
BGZF_stream & operator=(BGZF_stream const &) = delete;
31+
BGZF_stream & operator=(BGZF_stream &&) = delete;
2932

3033
template <class T>
3134
BGZF_stream & operator<<(T const & x);
35+
void check_cache();
3236
void flush();
33-
void open(std::string const & filename, std::string const & filemode);
37+
void open(std::string const & filename, std::string const & filemode, long const n_threads);
3438
bool is_open() const;
3539
void close(); // Close BGZF file
3640

37-
long MAX_CACHE_SIZE = 10000000ll;
41+
long MAX_CACHE_SIZE{1000000ll};
3842
};
3943

4044

41-
inline
42-
BGZF_stream::BGZF_stream(std::string const & filename, std::string const & filemode)
43-
{
44-
open(filename, filemode);
45-
}
46-
47-
4845
template <class T>
4946
inline
5047
BGZF_stream &
5148
BGZF_stream::operator<<(T const & x)
5249
{
5350
// Add to stringstream
5451
ss << x;
52+
return *this;
53+
}
54+
5555

56-
// Check if we should flush
56+
inline
57+
void
58+
BGZF_stream::check_cache()
59+
{
5760
if (static_cast<long>(ss.tellp()) > this->MAX_CACHE_SIZE)
5861
flush();
59-
60-
return *this;
6162
}
6263

6364

@@ -72,7 +73,7 @@ BGZF_stream::flush()
7273
}
7374
else
7475
{
75-
std::string const & str = ss.str();
76+
std::string str = ss.str();
7677
int ret = bgzf_write(fp, str.data(), str.size());
7778

7879
if (ret < 0)
@@ -89,14 +90,17 @@ BGZF_stream::flush()
8990

9091
inline
9192
void
92-
BGZF_stream::open(std::string const & filename, std::string const & filemode)
93+
BGZF_stream::open(std::string const & filename, std::string const & filemode, long const n_threads)
9394
{
9495
if (fp)
9596
close();
9697

9798
if (filename.size() > 0 && filename != "-")
9899
{
99100
fp = bgzf_open(filename.c_str(), filemode.c_str());
101+
102+
if (n_threads > 1)
103+
bgzf_mt(fp, n_threads, 256);
100104
}
101105
}
102106

include/graphtyper/utilities/genotype.hpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,24 +17,27 @@ namespace gyper
1717

1818
std::vector<std::string>
1919
run_bamshrink(std::vector<std::string> const & sams,
20+
std::vector<std::string> const & sams_index,
2021
std::string const & ref_fn,
2122
GenomicRegion const & region,
2223
std::vector<double> const & avg_cov_by_readlen,
2324
std::string const & tmp);
2425

26+
// bamshrink variant for multiple regions
2527
std::vector<std::string>
26-
run_bamshrink(std::vector<std::string> const & sams,
27-
std::string const & ref_fn,
28-
std::string const & interval_fn,
29-
std::vector<double> const & avg_cov_by_readlen,
30-
std::string const & tmp);
28+
run_bamshrink_multi(std::vector<std::string> const & sams,
29+
std::string const & ref_fn,
30+
std::string const & interval_fn,
31+
std::vector<double> const & avg_cov_by_readlen,
32+
std::string const & tmp);
3133

3234
void
3335
run_samtools_merge(std::vector<std::string> & shrinked_sams, std::string const & tmp);
3436

3537
void
3638
genotype(std::string ref_path,
3739
std::vector<std::string> const & sams,
40+
std::vector<std::string> const & sams_index,
3841
GenomicRegion const & region,
3942
std::string const & output_path,
4043
std::vector<double> const & avg_cov_by_readlen,
@@ -44,6 +47,7 @@ genotype(std::string ref_path,
4447
void
4548
genotype_regions(std::string const & ref_path,
4649
std::vector<std::string> const & sams,
50+
std::vector<std::string> const & sams_index,
4751
std::vector<GenomicRegion> const & regions,
4852
std::string const & output_path,
4953
std::vector<double> const & avg_cov_by_readlen,

include/graphtyper/utilities/options.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class Options
2929
bool no_decompose{false};
3030
bool no_bamshrink{false};
3131
bool no_variant_overlapping{false};
32+
bool normal_and_no_variant_overlapping{false};
3233
bool is_all_biallelic{false};
3334
bool is_only_cigar_discovery{false};
3435
bool is_discovery_only_for_paired_reads{false};
@@ -94,6 +95,8 @@ class Options
9495
long genotype_dis_min_support{8};
9596
double genotype_dis_min_support_ratio{0.30};
9697

98+
// internal vcf options
99+
long num_alleles_in_batch{500};
97100

98101
/********************************
99102
* HAPLOTYPE EXTRACTION OPTIONS *

0 commit comments

Comments
 (0)