ataudt
diff --git a/‎R/plotting.R‎
Lines changed: 3 additions & 3 deletions b/‎R/plotting.R‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎vignettes/PLOTS/H3K27me3-Adult-rep2.pdf‎
24.2 KB b/‎vignettes/PLOTS/H3K27me3-Adult-rep2.pdf‎
24.2 KB
diff --git a/‎vignettes/PLOTS/H3K27me3-Adult-rep2_binsize1000.pdf‎
24.2 KB b/‎vignettes/PLOTS/H3K27me3-Adult-rep2_binsize1000.pdf‎
24.2 KB
diff --git a/‎vignettes/PLOTS/H3K27me3-Adult-rep2_binsize150.pdf‎
6.89 KB b/‎vignettes/PLOTS/H3K27me3-Adult-rep2_binsize150.pdf‎
6.89 KB
diff --git a/‎vignettes/chromstaR.Rnw‎
Lines changed: 36 additions & 3 deletions b/‎vignettes/chromstaR.Rnw‎
Lines changed: 36 additions & 3 deletions
diff --git a/‎vignettes/chromstaR.pdf‎
689 KB b/‎vignettes/chromstaR.pdf‎
689 KB
@@ -297,15 +297,15 @@ plotHistogram <- function(model, state=NULL, chromosomes=NULL, start=NULL, end=N
     ### Plot the distributions
     if (is.null(state)) {
         ggplt <- ggplt + geom_line(data=df, aes_string(x='x', y='y', col='state'), size=linewidth)
-        ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('unmodified','modified','total')), labels=legend) + theme(legend.justification=c(1,1), legend.position=c(1,1))
+        ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('unmodified','modified','total')), labels=legend) + theme(legend.justification=c(1,1), legend.position=c(0.99,0.99))
     } else {
         if (state=="unmodified") {
             ggplt <- ggplt + geom_line(data=df[df$state=='unmodified',], aes_string(x='x', y='y', col='state'), size=linewidth)
-            ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('unmodified')), labels=legend[1]) + theme(legend.justification=c(1,1), legend.position=c(1,1))
+            ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('unmodified')), labels=legend[1]) + theme(legend.justification=c(1,1), legend.position=c(0.99,0.99))
         }
         if (state=="modified") {
             ggplt <- ggplt + geom_line(data=df[df$state=='modified',], aes_string(x='x', y='y', col='state'), size=linewidth)
-            ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('modified')), labels=legend[2]) + theme(legend.justification=c(1,1), legend.position=c(1,1))
+            ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('modified')), labels=legend[2]) + theme(legend.justification=c(1,1), legend.position=c(0.99,0.99))
         }
     }
 
 
@@ -82,6 +82,8 @@ exportCounts(model, filename=tempfile())
 @
 \end{scriptsize}
 
+\textbf{!! It is important that the distributions are fitted correctly !!} Please check section~\ref{sec:FAQ_example_histograms} for examples of what this plot should \emph{not} look like and what can be done to get a correct fit.
+
 \subsection{\label{sec:broad}Task 2: Peak calling for a broad histone modification}
 
 Examples of histone modifications with a broad profile are H3K9me3, H3K27me3, H3K36me3, H4K20me1 in most human tissues. These modifications usually cover broad domains of the genome, and the enrichment is best captured with a bin size between 500bp and 2000bp.
@@ -147,6 +149,8 @@ plotHistogram(model) + ggtitle('H4K20me1')
 @
 \end{scriptsize}
 
+\textbf{!! It is important that the distributions are fitted correctly !!} Please check section~\ref{sec:FAQ_example_histograms} for examples of what this plot should \emph{not} look like and what can be done to get a correct fit.
+
 \subsection{Task 3: Peak calling for ATAC-seq, DNase-seq, FAIRE-seq, ...}
 
 Peak calling for ATAC-seq and DNase-seq is similar to the peak calling of a narrow histone modification (section~\ref{sec:narrow}). FAIRE-seq experiments seem to exhibit a broad profile with our model, so the procedure is similar to the domain calling of a broad histone modification (section~\ref{sec:broad}).
@@ -191,7 +195,7 @@ exp <- data.frame(file=files, mark='H3K27me3', condition='SHR', replicate=1:4,
 # We use bin size 1000bp and chromosome 12 to keep the example quick
 binned.data <- list()
 for (file in files) {
-  binned.data[[basename(file)]] <- binReads(file, binsize=1000, stepsizes=500, 
+  binned.data[[basename(file)]] <- binReads(file, binsize=1000, stepsizes=500,
                                          assembly=rn4_chrominfo, chromosomes='chr12',
                                          experiment.table=exp)
 }
@@ -203,9 +207,13 @@ for (file in files) {
 models <- list()
 for (i1 in 1:length(binned.data)) {
   models[[i1]] <- callPeaksUnivariate(binned.data[[i1]], max.time=60)
+  plotHistogram(models[[i1]])
 }
 @
 
+\textbf{!! It is important that the distributions are fitted correctly !!} Please check section~\ref{sec:FAQ_example_histograms} for examples of what this plot should \emph{not} look like and what can be done to get a correct fit.\\
+
+
 <<multivariate_replicate_peak_calling, results='markup', message=FALSE, eval=TRUE>>=
 ## === Step 4: Check replicate correlation ===
 # We run a multivariate peak calling on all 4 replicates
@@ -260,7 +268,7 @@ head(rn4_chrominfo)
 #=== Step 2: Run Chromstar ===
 ## Run ChromstaR
 Chromstar(inputfolder, experiment.table=experiment_table_H4K20me1,
-          outputfolder=outputfolder, numCPU=4, binsize=1000, stepsize=500, 
+          outputfolder=outputfolder, numCPU=4, binsize=1000, stepsize=500,
           assembly=rn4_chrominfo, prefit.on.chr='chr12', chromosomes='chr12',
           mode='differential')
 @
@@ -271,6 +279,9 @@ model <- get(load(file.path(outputfolder,'multivariate',
                             'multivariate_mode-differential_mark-H4K20me1.RData')))
 @
 
+\textbf{!! It is important that the distributions in folder outputfolder/PLOTS/univariate-distributions are fitted correctly !!} Please check section~\ref{sec:FAQ_example_histograms} for examples of what these plots should \emph{not} look like and what can be done to get a correct fit.\\
+
+
 <<multivariate_differential_stateBrewer, results='markup', message=TRUE, eval=TRUE>>=
 ## === Step 3: Construct differential and common states ===
 diff.states <- stateBrewer(experiment_table_H4K20me1, mode='differential',
@@ -326,6 +337,9 @@ Chromstar(inputfolder, experiment.table=experiment_table_SHR,
           mode='combinatorial')
 @
 
+\textbf{!! It is important that the distributions in folder outputfolder/PLOTS/univariate-distributions are fitted correctly !!} Please check section~\ref{sec:FAQ_example_histograms} for examples of what these plots should \emph{not} look like and what can be done to get a correct fit.\\
+
+
 <<multivariate_combinatorial_listfiles, results='markup', message=FALSE, eval=TRUE, fig.width=4, fig.height=3, out.width='0.5\\textwidth'>>=
 ## Results are stored in 'outputfolder' and can be loaded for further processing
 list.files(outputfolder)
@@ -354,6 +368,7 @@ genes <- GRanges(seqnames=paste0('chr',genes$chromosome_name),
                  ranges=IRanges(start=genes$start, end=genes$end),
                  strand=genes$strand,
                  name=genes$external_gene_id, biotype=genes$gene_biotype)
+seqlevels(genes)[seqlevels(genes)=='chrMT'] <- 'chrM'
 print(genes)
 @
 
@@ -411,6 +426,7 @@ expression.SHR <- GRanges(seqnames=paste0('chr',expr$chromosome_name),
                           strand=expr$strand, name=expr$external_gene_id,
                           biotype=expr$gene_biotype,
                           expression=expr$expression_SHR)
+seqlevels(expression.SHR)[seqlevels(expression.SHR)=='chrMT'] <- 'chrM'
 # We apply an asinh transformation to reduce the effect of outliers
 expression.SHR$expression <- asinh(expression.SHR$expression)
 
@@ -472,6 +488,9 @@ model <- get(load(file.path(outputfolder,'combined',
                             'combined_mode-differential.RData')))
 @
 
+\textbf{!! It is important that the distributions in folder outputfolder/PLOTS/univariate-distributions are fitted correctly !!} Please check section~\ref{sec:FAQ_example_histograms} for examples of what these plots should \emph{not} look like and what can be done to get a correct fit.\\
+
+
 <<combined_analysis, results='markup', message=FALSE, eval=TRUE>>=
 #=== Step 3: Analysis and export ===
 ## Obtain all genomic regions where the two tissues have different states
@@ -506,6 +525,7 @@ genes <- GRanges(seqnames=paste0('chr',genes$chromosome_name),
                  ranges=IRanges(start=genes$start, end=genes$end),
                  strand=genes$strand,
                  name=genes$external_gene_id, biotype=genes$gene_biotype)
+seqlevels(genes)[seqlevels(genes)=='chrMT'] <- 'chrM'
 print(genes)
 @
 
@@ -529,7 +549,7 @@ plots[['BN']] + facet_wrap(~ mark) +
 tss <- resize(genes, width = 3, fix = 'start')
 biotypes <- split(tss, tss$biotype)
 plots <- plotFoldEnrichHeatmap(model, annotations=biotypes)
-plots[['BN']] + coord_flip() + 
+plots[['BN']] + coord_flip() +
   ggtitle('Fold enrichment with different biotypes')
 @
 \end{scriptsize}
@@ -607,7 +627,20 @@ heatmapCombinations(marks=c('H3K4me3', 'H3K27me3', 'H3K36me3', 'H3K27Ac'))
 @
 \end{scriptsize}
 
+\subsection{\label{sec:FAQ_example_histograms}Examples of problematic distributions.}
+
+For the chromstaR peak calling to work correctly, it is essential that the Baum-Welch algorithm correctly identifies the unmodified (background) and modified (signal/peak) components in the data. Therefore, you should always check the plots in folder \textbf{PLOTS/univariate-distributions} for correct convergence. Here are some plots that indicate failed and successful fitting procedures:
+
+% p1 <- ggdraw(p1) + draw_label("WRONG", angle = -45, size = 80, alpha = .2, colour = 'red')
+% p2 <- ggdraw(p2) + draw_label("CORRECT", angle = -45, size = 80, alpha = .2, colour = 'green')
+% cowplt <- plot_grid(p1, p2, labels = letters[1:2])
+% ggsave(cowplt, filename = '~/Bioconductor/chromstaR/vignettes/PLOTS/H3K27me3-Adult-rep2_binsize1000.pdf', width=42, height=15, units='cm')
+
+\includegraphics[width=\textwidth]{PLOTS/H3K27me3-Adult-rep2_binsize1000.pdf}
+The plot shows data for H3K27me3 at binsize 1000bp. (a) Incorrectly converged fit, where the \textbf{modified} component (red) has lower read counts than the \textbf{unmodified} component (gray). (b) Correctly converged fit. Even here, the fit could be improved by reducing the average number of reads per bin, either by selecting a smaller binsize or by downsampling the data before using chromstaR.
 
+\includegraphics[width=\textwidth]{PLOTS/H3K27me3-Adult-rep2_binsize150.pdf}
+The plot shows data for H3K27me3 at binsize 150bp. (a) Incorrectly converged fit, where the \textbf{modified} component (red) has a higher density at zero reads than the \textbf{unmodified} component (gray). (b) Correctly converged fit.
 
 \section{Session Info}
 \begin{scriptsize}
Original file line number	Diff line number	Diff line change
`@@ -297,15 +297,15 @@ plotHistogram <- function(model, state=NULL, chromosomes=NULL, start=NULL, end=N`
`297`	`297`	`### Plot the distributions`
`298`	`298`	`if (is.null(state)) {`
`299`	`299`	`ggplt <- ggplt + geom_line(data=df, aes_string(x='x', y='y', col='state'), size=linewidth)`
`300`		`- ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('unmodified','modified','total')), labels=legend) + theme(legend.justification=c(1,1), legend.position=c(1,1))`
	`300`	`+ ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('unmodified','modified','total')), labels=legend) + theme(legend.justification=c(1,1), legend.position=c(0.99,0.99))`
`301`	`301`	`} else {`
`302`	`302`	`if (state=="unmodified") {`
`303`	`303`	`ggplt <- ggplt + geom_line(data=df[df$state=='unmodified',], aes_string(x='x', y='y', col='state'), size=linewidth)`
`304`		`- ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('unmodified')), labels=legend[1]) + theme(legend.justification=c(1,1), legend.position=c(1,1))`
	`304`	`+ ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('unmodified')), labels=legend[1]) + theme(legend.justification=c(1,1), legend.position=c(0.99,0.99))`
`305`	`305`	`}`
`306`	`306`	`if (state=="modified") {`
`307`	`307`	`ggplt <- ggplt + geom_line(data=df[df$state=='modified',], aes_string(x='x', y='y', col='state'), size=linewidth)`
`308`		`- ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('modified')), labels=legend[2]) + theme(legend.justification=c(1,1), legend.position=c(1,1))`
	`308`	`+ ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c('modified')), labels=legend[2]) + theme(legend.justification=c(1,1), legend.position=c(0.99,0.99))`
`309`	`309`	`}`
`310`	`310`	`}`
`311`	`311`