Contents
An Example Run
Below is an example that uses the test dataset provided with NucKit. The following commands lead the user through each utility to go from unprocessed paired-end sequence data at the read level to identified genomic locations. This type of workflow has several uses in bioinformatics, including the monitoring of foreign DNA integrated into a host genome. The sequence reads themselves represent the product of an amplification reaction to target the foreign DNA and the flanking host genomic sequence.
# Activate the conda environment named 'nuckit' before running any of the commands below.
conda activate nuckit
# Change into the NucKit directory. {path/to/nuckit} is a placeholder: replace it
# with the real path. Every etc/... path below is relative to this directory.
cd {path/to/nuckit}
# Step 1: run `nuckit demulti` on the undetermined read files (R1/R2) and their
# index reads (I1/I2), using the sample sheet given with -m. Output is written
# under etc/test_output (-o), with --compress enabled.
nuckit demulti -m etc/test.sampleinfo.csv \
--read1 etc/test_data/Undetermined_S0_L001_R1_001.fastq.gz \
--read2 etc/test_data/Undetermined_S0_L001_R2_001.fastq.gz \
--idx1 etc/test_data/Undetermined_S0_L001_I1_001.fastq.gz \
--idx2 etc/test_data/Undetermined_S0_L001_I2_001.fastq.gz \
-o etc/test_output \
--compress
# Step 2: run `nuckit trim` on each mate of sample testSeq-1.
# R2 is given both a -l sequence (with --leadMismatch 3) and a -r sequence
# (with --overMismatch 4 and --overMaxLength 20).
nuckit trim etc/test_output/testSeq-1.R2.fastq.gz \
-o etc/test_output/testSeq-1.R2.trim.fastq.gz \
-l ACATATGACAACTCAATTAAACGCGAGC --leadMismatch 3 \
-r AGATCGGAAGAGCGTCGTGT --overMismatch 4 --overMaxLength 20 \
--compress
# R1 is given only a -r sequence, with the same --overMismatch / --overMaxLength values.
nuckit trim etc/test_output/testSeq-1.R1.fastq.gz \
-o etc/test_output/testSeq-1.R1.trim.fastq.gz \
-r GCTCGCGTTTAATTGAGTTGTCATATGT --overMismatch 4 --overMaxLength 20 \
--compress
# Step 3: run `nuckit filt` on the trimmed R2 and R1 files together. The two -o
# paths are listed in the same R2, R1 order as the inputs.
# NOTE(review): the -o names end in .fastq while the next step reads
# .filt.fastq.gz — presumably --compress appends the .gz suffix; confirm.
nuckit filt etc/test_output/testSeq-1.R2.trim.fastq.gz etc/test_output/testSeq-1.R1.trim.fastq.gz \
-o etc/test_output/testSeq-1.R2.filt.fastq etc/test_output/testSeq-1.R1.filt.fastq \
--compress
# Step 4: run `nuckit consol` once per mate (R2 first, then R1) on the filtered
# reads. Each pass writes a FASTA file (-o) and a key CSV (-k); both passes use
# the same -l value, "testSeq1.".
for mate in R2 R1; do
  nuckit consol "etc/test_output/testSeq-1.${mate}.filt.fastq.gz" \
    -o "etc/test_output/testSeq-1.${mate}.consol.fasta" \
    -k "etc/test_output/testSeq-1.${mate}.key.csv" \
    -l testSeq1. \
    --compress
done
# Step 5: align each consol FASTA against hg38 with blat, then gzip the PSL output.
# Make sure you have 'blat' installed and a 2bit copy of the hg38 reference genome.
# hg38.2bit is given as a relative path, so it must be reachable from the
# current directory (or replace it with the full path to your copy).
#
# Every line-continuation backslash must be preceded by whitespace: the shell
# deletes the backslash-newline pair, so without the space the output file name
# would fuse with the option on the next line.
blat hg38.2bit etc/test_output/testSeq-1.R2.consol.fasta etc/test_output/testSeq-1.R2.psl \
  -tileSize=11 -stepSize=9 -minIdentity=85 -maxIntron=5 \
  -minScore=27 -dots=1000 -out=psl -noHead
gzip etc/test_output/testSeq-1.R2.psl
blat hg38.2bit etc/test_output/testSeq-1.R1.consol.fasta etc/test_output/testSeq-1.R1.psl \
  -tileSize=11 -stepSize=9 -minIdentity=85 -maxIntron=5 \
  -minScore=27 -dots=1000 -out=psl -noHead
gzip etc/test_output/testSeq-1.R1.psl
# Step 6: run `nuckit couple` on the R2 and R1 alignments (in that order) with
# their matching key CSVs (-k, same order). Results go to the -o, --condSites,
# --chimera and --multihit paths under etc/test_output; --refGenome is hg38.
# NOTE(review): the PSL inputs are read from etc/test_data, whereas the blat step
# above writes testSeq-1.R2.psl.gz / testSeq-1.R1.psl.gz under etc/test_output —
# confirm which directory is intended.
nuckit couple etc/test_data/testSeq-1.R2.psl.gz etc/test_data/testSeq-1.R1.psl.gz \
-k etc/test_output/testSeq-1.R2.key.csv etc/test_output/testSeq-1.R1.key.csv \
-o etc/test_output/testSeq-1.uniq.csv \
--condSites etc/test_output/testSeq-1.cond.csv \
--chimera etc/test_output/testSeq-1.chimera.rds \
--multihit etc/test_output/testSeq-1.multihit.rds \
--refGenome hg38
# Leave the conda environment activated at the start of the run.
conda deactivate