Read FastQC data into R.
qc_read(file, modules = "all", verbose = TRUE)
Path to the file to be imported. Can be the path to any one of the following:
the FastQC zipped file (e.g.: 'path/to/samplename_fastqc.zip'); there is no need to unzip it,
or the unzipped folder name (e.g.: 'path/to/samplename_fastqc'),
or the sample name (e.g.: 'path/to/samplename'),
or the fastqc_data.txt file.
Character vector containing the names of the FastQC modules for which you want to import/inspect the data. Default is "all". Allowed values include one or a combination of:
"Summary",
"Basic Statistics",
"Per base sequence quality",
"Per tile sequence quality",
"Per sequence quality scores",
"Per base sequence content",
"Per sequence GC content",
"Per base N content",
"Sequence Length Distribution",
"Sequence Duplication Levels",
"Overrepresented sequences",
"Adapter Content",
"Kmer Content"
Partial matching of module names is allowed. For example, you can use modules = "GC content" instead of the full name, modules = "Per sequence GC content".
Logical value. If TRUE, the file name is printed when reading.
Returns a list (of class "qc_read") of tibbles containing the data for the specified modules.
# Demo file
qc.file <- system.file("fastqc_results", "S1_fastqc.zip", package = "fastqcr")
qc.file
#> [1] "/private/var/folders/xm/8p6yj4bj6s57n4v_51714lwm0000gp/T/RtmpT6jSz8/temp_libpatha9b37e9f6eab/fastqcr/fastqc_results/S1_fastqc.zip"
# Read all modules
qc_read(qc.file)
#> Reading: /private/var/folders/xm/8p6yj4bj6s57n4v_51714lwm0000gp/T/RtmpT6jSz8/temp_libpatha9b37e9f6eab/fastqcr/fastqc_results/S1_fastqc.zip
#> Warning: Missing column names filled in: 'X1' [1]
#> Warning: Missing column names filled in: 'X1' [1]
#> $summary
#> # A tibble: 12 × 3
#> status module sample
#> <chr> <chr> <chr>
#> 1 PASS Basic Statistics S1.fastq
#> 2 PASS Per base sequence quality S1.fastq
#> 3 PASS Per tile sequence quality S1.fastq
#> 4 PASS Per sequence quality scores S1.fastq
#> 5 FAIL Per base sequence content S1.fastq
#> 6 WARN Per sequence GC content S1.fastq
#> 7 PASS Per base N content S1.fastq
#> 8 WARN Sequence Length Distribution S1.fastq
#> 9 PASS Sequence Duplication Levels S1.fastq
#> 10 PASS Overrepresented sequences S1.fastq
#> 11 PASS Adapter Content S1.fastq
#> 12 PASS Kmer Content S1.fastq
#>
#> $basic_statistics
#> # A tibble: 7 × 2
#> Measure Value
#> <chr> <chr>
#> 1 Filename S1.fastq
#> 2 File type Conventional base calls
#> 3 Encoding Sanger / Illumina 1.9
#> 4 Total Sequences 50299587
#> 5 Sequences flagged as poor quality 0
#> 6 Sequence length 35-76
#> 7 %GC 48
#>
#> $per_base_sequence_quality
#> # A tibble: 43 × 7
#> Base Mean Median `Lower Quartile` `Upper Quartile` 10th Percentil…¹ 90th …²
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 31.2 32 32 32 32 32
#> 2 2 31.5 32 32 32 32 32
#> 3 3 31.7 32 32 32 32 32
#> 4 4 31.7 32 32 32 32 32
#> 5 5 31.7 32 32 32 32 32
#> 6 6 35.3 36 36 36 36 36
#> 7 7 35.3 36 36 36 36 36
#> 8 8 35.3 36 36 36 36 36
#> 9 9 35.3 36 36 36 36 36
#> 10 10-11 35.3 36 36 36 36 36
#> # … with 33 more rows, and abbreviated variable names ¹`10th Percentile`,
#> # ²`90th Percentile`
#>
#> $per_tile_sequence_quality
#> # A tibble: 18,576 × 3
#> Tile Base Mean
#> <dbl> <chr> <dbl>
#> 1 11101 1 0.175
#> 2 11101 2 0.0478
#> 3 11101 3 0.0668
#> 4 11101 4 0.0558
#> 5 11101 5 0.0485
#> 6 11101 6 0.0194
#> 7 11101 7 0.104
#> 8 11101 8 0.0629
#> 9 11101 9 0.103
#> 10 11101 10-11 0.0580
#> # … with 18,566 more rows
#>
#> $per_sequence_quality_scores
#> # A tibble: 34 × 2
#> Quality Count
#> <dbl> <dbl>
#> 1 2 75
#> 2 3 0
#> 3 4 0
#> 4 5 0
#> 5 6 0
#> 6 7 0
#> 7 8 0
#> 8 9 0
#> 9 10 0
#> 10 11 0
#> # … with 24 more rows
#>
#> $per_base_sequence_content
#> # A tibble: 43 × 5
#> Base G A T C
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 24.1 27.4 24.5 24.0
#> 2 2 23.5 27.2 25.5 23.8
#> 3 3 23.2 25.8 26.3 24.7
#> 4 4 23.5 25.9 26.2 24.3
#> 5 5 23.7 26.3 26.1 23.9
#> 6 6 24.3 25.4 25.6 24.7
#> 7 7 24.1 25.7 26.1 24.1
#> 8 8 23.5 25.8 26.2 24.5
#> 9 9 23.5 25.6 26.4 24.5
#> 10 10-11 23.6 25.9 26.4 24.1
#> # … with 33 more rows
#>
#> $per_sequence_gc_content
#> # A tibble: 101 × 2
#> `GC Content` Count
#> <dbl> <dbl>
#> 1 0 81
#> 2 1 44
#> 3 2 14
#> 4 3 39.5
#> 5 4 58
#> 6 5 78.5
#> 7 6 143
#> 8 7 264.
#> 9 8 342.
#> 10 9 428.
#> # … with 91 more rows
#>
#> $per_base_n_content
#> # A tibble: 43 × 2
#> Base `N-Count`
#> <chr> <dbl>
#> 1 1 0.0634
#> 2 2 0.000310
#> 3 3 0.000270
#> 4 4 0.000153
#> 5 5 0.000149
#> 6 6 0.00938
#> 7 7 0.00256
#> 8 8 0.000260
#> 9 9 0.000282
#> 10 10-11 0.000604
#> # … with 33 more rows
#>
#> $sequence_length_distribution
#> # A tibble: 42 × 2
#> Length Count
#> <dbl> <dbl>
#> 1 35 1282
#> 2 36 144
#> 3 37 160
#> 4 38 172
#> 5 39 177
#> 6 40 164
#> 7 41 174
#> 8 42 183
#> 9 43 167
#> 10 44 198
#> # … with 32 more rows
#>
#> $sequence_duplication_levels
#> # A tibble: 16 × 3
#> `Duplication Level` `Percentage of deduplicated` `Percentage of total`
#> <chr> <dbl> <dbl>
#> 1 1 83.8 69.4
#> 2 2 12.7 21.1
#> 3 3 2.63 6.53
#> 4 4 0.591 1.96
#> 5 5 0.152 0.629
#> 6 6 0.0400 0.199
#> 7 7 0.0128 0.0744
#> 8 8 0.00532 0.0352
#> 9 9 0.00243 0.0181
#> 10 >10 0.00393 0.0415
#> 11 >50 0.0000298 0.00218
#> 12 >100 0.0000247 0.00524
#> 13 >500 0.00000300 0.00202
#> 14 >1k 0.00000266 0.00260
#> 15 >5k 0 0
#> 16 >10k+ 0.00000241 0.0565
#>
#> $overrepresented_sequences
#> # A tibble: 0 × 1
#> # … with 1 variable: X1 <chr>
#>
#> $adapter_content
#> # A tibble: 64 × 5
#> Position `Illumina Universal Adapter` Illumina Small RNA Ad…¹ Nexte…² SOLID…³
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.00000994 0.00000199 1.39e-5 0
#> 2 2 0.0000119 0.00000199 2.58e-5 1.99e-6
#> 3 3 0.0000179 0.00000398 3.58e-5 5.96e-6
#> 4 4 0.0000258 0.00000398 4.37e-5 5.96e-6
#> 5 5 0.0000338 0.00000398 4.97e-5 5.96e-6
#> 6 6 0.0000378 0.00000398 5.96e-5 7.95e-6
#> 7 7 0.0000437 0.00000398 6.76e-5 7.95e-6
#> 8 8 0.0000537 0.00000398 7.36e-5 9.94e-6
#> 9 9 0.0000557 0.00000398 8.35e-5 9.94e-6
#> 10 10 0.0000557 0.00000398 9.54e-5 9.94e-6
#> # … with 54 more rows, and abbreviated variable names
#> # ¹`Illumina Small RNA Adapter`, ²`Nextera Transposase Sequence`,
#> # ³`SOLID Small RNA Adapter`
#>
#> $kmer_content
#> # A tibble: 0 × 1
#> # … with 1 variable: X1 <chr>
#>
#> $total_deduplicated_percentage
#> [1] 82.76
#>
#> attr(,"class")
#> [1] "list" "qc_read"
# Read a specified module
qc_read(qc.file,"Per base sequence quality")
#> Reading: /private/var/folders/xm/8p6yj4bj6s57n4v_51714lwm0000gp/T/RtmpT6jSz8/temp_libpatha9b37e9f6eab/fastqcr/fastqc_results/S1_fastqc.zip
#> $per_base_sequence_quality
#> # A tibble: 43 × 7
#> Base Mean Median `Lower Quartile` `Upper Quartile` 10th Percentil…¹ 90th …²
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 31.2 32 32 32 32 32
#> 2 2 31.5 32 32 32 32 32
#> 3 3 31.7 32 32 32 32 32
#> 4 4 31.7 32 32 32 32 32
#> 5 5 31.7 32 32 32 32 32
#> 6 6 35.3 36 36 36 36 36
#> 7 7 35.3 36 36 36 36 36
#> 8 8 35.3 36 36 36 36 36
#> 9 9 35.3 36 36 36 36 36
#> 10 10-11 35.3 36 36 36 36 36
#> # … with 33 more rows, and abbreviated variable names ¹`10th Percentile`,
#> # ²`90th Percentile`
#>
#> attr(,"class")
#> [1] "list" "qc_read"