% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/vs_fastx_subsample.R
\name{vs_fastx_subsample}
\alias{vs_fastx_subsample}
\alias{vs_fastq_subsample}
\alias{vs_fasta_subsample}
\alias{vs_subsample}
\alias{fastx_subsample}
\alias{subsample}
\title{Subsample sequences}
\usage{
vs_fastx_subsample(
  fastx_input,
  output_format = "fastq",
  fastx_output = NULL,
  sample_pct = NULL,
  sample_size = NULL,
  sizein = TRUE,
  sizeout = TRUE,
  relabel = NULL,
  relabel_sha1 = FALSE,
  randseed = NULL,
  fasta_width = 0,
  sample = NULL,
  threads = 1,
  vsearch_options = NULL,
  tmpdir = NULL
)
}
\arguments{
\item{fastx_input}{(Required). A FASTA/FASTQ file path or FASTA/FASTQ object.
See \emph{Details}.}

\item{output_format}{(Optional). Desired output format of file or tibble:
\code{"fasta"} or \code{"fastq"} (default). If \code{fastx_input} is a FASTA
file path or a FASTA object, \code{output_format} cannot be \code{"fastq"}.}

\item{fastx_output}{(Optional). Name of the output file for subsampled reads
from \code{fastx_input}. File can be in either FASTA or FASTQ format,
depending on \code{output_format}. If \code{NULL} (default), no sequences are
written to file. See \emph{Details}.}

\item{sample_pct}{(Optional). Percentage of the input sequences to be
subsampled. Numeric value ranging from \code{0.0} to \code{100.0}. Defaults
to \code{NULL}.}

\item{sample_size}{(Optional). The given number of sequences to extract. Must
be a positive integer if specified. Defaults to \code{NULL}.}

\item{sizein}{(Optional). If \code{TRUE} (default), abundance annotations
present in sequence headers are taken into account.}

\item{sizeout}{(Optional). If \code{TRUE} (default), abundance annotations
are added to FASTA headers.}

\item{relabel}{(Optional). Relabel sequences using the given prefix and a
ticker to construct new headers. Defaults to \code{NULL}.}

\item{relabel_sha1}{(Optional). If \code{TRUE}, relabel sequences using the
SHA1 message digest algorithm. Defaults to \code{FALSE}.}

\item{randseed}{(Optional). Random seed. Must be a positive integer. A given
seed always produces the same output, which is useful for replicability.
Defaults to \code{NULL}.}

\item{fasta_width}{(Optional). Number of characters per line in the output
FASTA file. Defaults to \code{0}, which eliminates wrapping.}

\item{sample}{(Optional). Add the given sample identifier string to sequence
headers. For instance, if the given string is "ABC", the text ";sample=ABC"
will be added to the header. If \code{NULL} (default), no identifier is added.}

\item{threads}{(Optional). Number of computational threads to be used by
\code{VSEARCH}. Defaults to \code{1}.}

\item{vsearch_options}{(Optional). Additional arguments to pass to
\code{VSEARCH}. Defaults to \code{NULL}. See \emph{Details}.}

\item{tmpdir}{(Optional). Path to the directory where temporary files should
be written when tables are used as input or output. Defaults to
\code{NULL}, which resolves to the session-specific temporary directory
(\code{tempdir()}).}
}
\value{
A tibble or \code{NULL}.

If \code{fastx_output} is specified, the subsampled sequences are written to
the specified output file, and no tibble is returned.

If \code{fastx_output} is \code{NULL}, a tibble containing the subsampled reads
in the format specified by \code{output_format} is returned.
}
\description{
\code{vs_fastx_subsample} subsamples sequences in a FASTA/FASTQ
file or object by randomly extracting sequences based on number or percentage
using \code{VSEARCH}.
}
\details{
Sequences in the input file/object (\code{fastx_input}) are subsampled by
randomly extracting a specified number or percentage of sequences. Extraction
is performed as random sampling with a uniform distribution among the input
sequences and without replacement.

\code{fastx_input} can either be a FASTA/FASTQ file or a FASTA/FASTQ object.
FASTA objects are tibbles that contain the columns \code{Header} and
\code{Sequence}, see \code{\link[microseq]{readFasta}}. FASTQ objects are
tibbles that contain the columns \code{Header}, \code{Sequence}, and
\code{Quality}, see \code{\link[microseq]{readFastq}}.

Specify either \code{sample_size} or \code{sample_pct} to determine the
number or percentage of sequences to subsample. Only one of these parameters
can be specified at a time. If neither is specified, an error is thrown.

If \code{fastx_output} is specified, the sampled sequences are output to this
file in the format given by \code{output_format}.
If \code{fastx_output} is \code{NULL}, the sampled sequences are returned as a
FASTA or FASTQ object, depending on \code{output_format}.

\code{vsearch_options} allows users to pass additional command-line arguments
to \code{VSEARCH} that are not directly supported by this function. Refer to
the \code{VSEARCH} manual for more details.
}
\examples{
\dontrun{
# Define arguments
fastx_input <- file.path(file.path(path.package("Rsearch"), "extdata"),
                         "small_R1.fq")
fastx_output <- NULL
output_format <- "fastq"
sample_size <- 10

# Subsample sequences and return a FASTQ tibble
subsample_R1 <- vs_fastx_subsample(fastx_input = fastx_input,
                                   fastx_output = fastx_output,
                                   output_format = output_format,
                                   sample_size = sample_size)

# Subsample sequences and write subsampled sequences to a file
vs_fastx_subsample(fastx_input = fastx_input,
                   fastx_output = "subsample.fq",
                   output_format = output_format,
                   sample_size = sample_size)
}

}
\references{
\url{https://github.com/torognes/vsearch}
}
