## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, warning=FALSE, message=FALSE--------------------------------------
library(longreadvqs)
library(ggplot2)

#Load sample 1.
sample1 <- system.file("extdata", "sample1.fasta", package = "longreadvqs")

## ----fig0, warning=FALSE, fig.height = 3, fig.width = 7.2, fig.align = "center"----
#Check which % cut-off could effectively minimize noises by assessing % singleton haplotypes.
x <- pctopt(sample1, pctsing = 0, label = "sample1")
ggplot(x, aes(x=pct, y=pctsingleton)) + geom_line() + geom_point()

## ----cmd0, warning=FALSE, message=FALSE---------------------------------------
#VQS diversity metrics of noise minimized (10% cut-off) read alignment
vqssub(sample1, pct = 10, label = "sample1")

## ----cmd1, warning=FALSE, message=FALSE---------------------------------------
#Load samples 2 and 3.
sample2 <- system.file("extdata", "sample2.fasta", package = "longreadvqs")
sample3 <- system.file("extdata", "sample3.fasta", package = "longreadvqs")

#Noise minimization (10% cut-off) and down-sampling (depth of 300)
a <- vqssub(sample1, pct = 10, samsize = 300, label = "sample1")
b <- vqssub(sample2, pct = 10, samsize = 300, label = "sample2")
c <- vqssub(sample3, pct = 10, samsize = 300, label = "sample3")

#Compare VQS diversity across three samples after Noise minimization and down-sampling.
rbind(a, b, c)

## ----cmd2, warning=FALSE, message=FALSE---------------------------------------
#Randomly down-sample from depth of 655 to 300, and 100 (10 iterations for each sample size).
set.seed(123)
c_full <- vqssub(sample3, pct = 10, label = "full_depth") #non-sampled alignment
c_300 <- vqsresub(sample3, iter = 10, pct = 10, 
                  samsize = 300, label = "depth_300") #down-sample to depth of 300
c_100 <- vqsresub(sample3, iter = 10, pct = 10,
                  samsize = 100, label = "depth_100") #down-sample to depth of 100

all_c <- rbind(c_full, c_300, c_100) #combine data

#Load packages for visualization.
library(cowplot)

all_c$depth <- as.character(all_c$depth)
depthorder <- c("655", "300", "100")

#Plot box-plots of three VQS metrics variation through different sampling depths.
haplotypes <- ggplot(all_c, aes(x = factor(depth, level=depthorder), y = haplotypes, 
	group = interaction(depth, label), fill = label)) + geom_boxplot() + 
  xlab("depth") + theme(legend.position = "none")
shannon <- ggplot(all_c, aes(x = factor(depth, level=depthorder), y = shannon, 
	group = interaction(depth, label), fill = label)) + geom_boxplot() + 
  xlab("depth") + theme(legend.position = "none")
gini_simpson <- ggplot(all_c, aes(x = factor(depth, level=depthorder), y = gini_simpson, 
	group = interaction(depth, label), fill = label)) + geom_boxplot() + 
  xlab("depth") + theme(legend.position = "none")

## ----fig1, fig.height = 3, fig.width = 7.2, fig.align = "center"--------------
plot_grid(haplotypes, shannon, gini_simpson, nrow = 1)

## ----cmd3, warning=FALSE, message=FALSE---------------------------------------
#Generate complete VQS data for further comparison for each sample.
s1 <- vqsassess(sample1, pct = 10, samsize = 300, label = "sample1")
s2 <- vqsassess(sample2, pct = 10, samsize = 300, label = "sample2")
s3 <- vqsassess(sample3, pct = 10, samsize = 300, label = "sample3")

## ----fig2, fig.height = 6, fig.width = 7.2, fig.align = "center"--------------
#Compare SNV profile between three samples.
snvcompare(samplelist = list(s1, s2, s3), ncol = 1)

## ----cmd4, warning=FALSE, message=FALSE---------------------------------------
#Load sample 4.
sample4 <- system.file("extdata", "mock.fasta", package = "longreadvqs")
s4 <- vqsassess(sample4, pct = 10, samsize = 300, label = "sample4")

## ----fig3, warning=FALSE, fig.height = 2, fig.width = 7.2, fig.align = "center"----
#SNV profile of sample 4
s4$snv

## ----cmd5, warning=FALSE, message=FALSE---------------------------------------
#Use "vqscustompct" function to increase % cut-off after position 972 to 30%.
s4_fix <- vqscustompct(sample4, pct = 10, brkpos = c("1:971","972:982"), lspct = c(10,30), 
                       samsize = 300, label = "sample4")

## ----fig4, warning=FALSE, fig.height = 2, fig.width = 7.2, fig.align = "center"----
#SNV profile of sample 4 after % cut-off adjustment
s4_fix$snv

## ----cmd6, warning=FALSE, message=FALSE---------------------------------------
#List outputs from "vqsassess" or "vqscustompct" that we want to compare into "vqscompare" function.
#Set the number of new OTU groups based on k-means clustering to 10 groups (kmeans.n = 10).
set.seed(1234)
comp <- vqscompare(samplelist = list(s1, s2, s3, s4_fix),
                   lab_name = "Sample", kmeans.n = 10)

## ----fig5, warning=FALSE, fig.height = 6, fig.width = 7.2, fig.align = "center"----
#The most important output of the "vqscompare" function is the summary plot.
comp$summaryplot