Identify the Phred scale of quality scores used in fastQ

From BITS wiki
Jump to: navigation, search


[ Main_Page ]


Scan fastQ data and determine which scoring scale is used

Choosing the right quality score format is mandatory to produce results. FastQ come in different flavours described here ([1]). The following Perl script will scan a number of reads from a FastQ file and match quality scores with the expected ranges. It will return the scales compatible with your data and help you identify the fastq format you have inherited. A Cheat-Sheet is available [here] to translate various quality characters to error probabilities.

different 'phed' scales used in NGS reads (http://en.wikipedia.org/wiki/FASTQ_format)

fastq_phread-base.png

Perl script used to diagnose Phred type from fastq data

The code hidden below will take fastq data as input (even compressed) and determine which formats match the score code used. A limit can be set to the number of lines to process.

# use first 1000 records
fastq_detect.pl fastq.file 1000

fastq_detect.pl

#!/usr/bin/perl -w
 
# http://wiki.bits.vib.be/index.php/Identify_the_Phred_scale_of_quality_scores_used_in_fastQ
 
use strict;
use File::Basename;
use List::MoreUtils qw( minmax );
 
# fastq_detect.pl fastq.file sample-size
# detect fastQ format from quality scores in fastQ input file
# Version 3
#
# Stephane Plaisance - VIB-BITS - July-04-2012 
# Joachim Jacob - Aug-02-2012 - joachim.jacob@gmail.com
# - changed the maximum value of Sanger to 73
# - changed reading the file with a file handle 
#   (was a file handle !! supporting several archive formats. SP)
# - changed the diagnosing algoritm
# Stephane Plaisance - VIB-BITS - April-08-2013 
# - merged both versions and corrected flaw in min/max
# thanks to Sergey Mitrfanov for perl reformatting
 
#####################################################################
# diagnose
#   SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
#   ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
#   ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
#   .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
#   LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
#   !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
#   |                         |    |        |                              |                     |
#  33                        59   64       73                            104                   126
# S 0........................26...31.......40                               
# X                          -5....0........9.............................40
# I                                0........9.............................40
# J                                   3.....9.............................40
# L 0.2......................26...31........41                              
# 
#  S - Sanger        Phred+33,  raw reads typically (0, 40)
#  X - Solexa        Solexa+64, raw reads typically (-5, 40)
#  I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
#  J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40) with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
#  L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
#####################################################################
 
my $script = basename($0);
 
@ARGV gt 0 or die "usage: $script <fastq file> <opt:sample-size (100)>\n";
my ($inputfile, $limit) = @ARGV;
if (! defined $limit) { $limit = 100}; # check first 100 records
 
my $cnt=0;
my ($min, $max); # global min and max values
 
print STDERR "\n## Analysing ".$limit." records from $inputfile ... \n";
my $z = ReadFile ($inputfile) || die "Error: cannot read from variant file $inputfile: $!\n";
 
## parse
while (my $id = <$z>) {
	$id =~ m/^@/ || die "expected @ not found in line 1!\n";
	my $seq = <$z>;
	my $sep = <$z>;
	$sep =~ m/^\+/ || die "expected + not found in line 3!\n";
	my $qual = <$z>;
	chomp($qual);
	$cnt++;
	$cnt>=$limit && last;
 
	# char to ascii
	my @chars = split("", $qual);
	my @nums = sort { $a <=> $b } (map { unpack("C*", $_ )} @chars);
 
	if ($cnt==1) {
		($min, $max) = minmax @nums;
	} else {
		my ($lmin, $lmax) = minmax @nums; # local values for this read
		$lmin<$min ? $min=$lmin : $min=$min;
		$lmax>$max ? $max=$lmax : $max=$max;
	}
}
 
undef $z;
 
## diagnose
my %diag=(
			'Sanger'		=> '.',
			'Solexa'		=> '.',
			'Illumina 1.3+'	=> '.',
			'Illumina 1.5+'	=> '.',
			'Illumina 1.8+'	=> '.',
			);
 
my %comment=(
			'Sanger'		=> 'Phred+33,  Q[33; 73],  (0, 40)',
			'Solexa'		=> 'Solexa+64, Q[59; 104], (-5, 40)',
			'Illumina 1.3+'	=> 'Phred+64,  Q[64; 104], (0, 40)',
			'Illumina 1.5+'	=> 'Phred+64,  Q[66; 104], (3, 40), with 0=N/A, 1=N/A, 2=Read Segment Quality Control Indicator',
			'Illumina 1.8+'	=> 'Phred+33,  Q[33; 74],  (0, 41)',
			);
 
if ($min<33 || $max>104) { die "Quality values corrupt. found [$min; $max] where [33; 104] was expected\n"; }
if ($min>=33 && $max<=73)  {$diag{'Sanger'}='x';}
if ($min>=59 && $max<=104) {$diag{'Solexa'}='x';}
if ($min>=64 && $max<=104) {$diag{'Illumina 1.3+'}='x';}
if ($min>=66 && $max<=104) {$diag{'Illumina 1.5+'}='x';}
if ($min>=33 && $max<=74)  {$diag{'Illumina 1.8+'}='x';}
 
## report
print STDERR "# sampled raw quality values are in the range of [".$min."; ".$max."]\n";
print STDERR "# format(s) marked below with 'x' agree with this range\n";
 
foreach my $format (sort keys %diag) {
	print STDERR sprintf("  %-13s : %2s  [%-30s] \n", $format, $diag{$format}, $comment{$format});
}
 
 
##############
#### Subs ####
 
# reads from uncompressed, gzipped and bgzip fastQ files
sub ReadFile {
	my $infile = shift;
	my $FH;
	if ($infile =~ /.bz2$/) {
		open ($FH, "bzcat $infile |") or die ("$!: can't open file $infile");
	} elsif ($infile =~ /.gz$/) {
		open ($FH, "zcat $infile |") or die ("$!: can't open file $infile");
	} elsif ($infile =~ /.fq|.fastq|.txt$/) {
		open ($FH, "cat $infile |") or die ("$!: can't open file $infile");
	} else {
		die ("$!: do not recognise file type $infile");
	}
	return $FH;
}

Python script by 'Brent Pedersen' to do the same

This script was found on github[2]. The code should be saved to a local file named 'guess-encoding.py', the file made executable, and called as follows

Handicon.png This code runs faster than its perl counterpart and the result can be used to 'grep out' the first-found matching type and use it in another command.

# if compressed, decompress with zcat and pipe
# use first 1000 records
awk 'NR % 4 == 0' your.fastq | guess-encoding.py -n 1000

guess-encoding.py

#!/usr/bin/env python
 
import sys
import optparse
 
RANGES = {
    'Sanger': (33, 73),
    'Illumina-1.8': (33, 74),
    'Solexa': (59, 104),
    'Illumina-1.3': (64, 104),
    'Illumina-1.5': (67, 104)
}
 
def get_qual_range(qual_str):
    """
    >>> get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
    (68, 89)
    """
 
    vals = [ord(c) for c in qual_str]
    return min(vals), max(vals)
 
def get_encodings_in_range(rmin, rmax, ranges=RANGES):
    valid_encodings = []
    for encoding, (emin, emax) in ranges.items():
        if rmin >= emin and rmax <= emax:
            valid_encodings.append(encoding)
    return valid_encodings
 
def main():
    p = optparse.OptionParser(__doc__)
    p.add_option("-n", dest="n", help="number of qual lines to test default:-1"
                 " means test until end of file or until it it possible to "
                 " determine a single file-type",
                 type='int', default=-1)
 
    opts, args = p.parse_args()
    print >>sys.stderr, "# reading qualities from stdin"
    gmin, gmax  = 99, 0
    valid = []
    for i, line in enumerate(sys.stdin):
        lmin, lmax = get_qual_range(line.rstrip())
        if lmin < gmin or lmax > gmax:
            gmin, gmax = min(lmin, gmin), max(lmax, gmax)
            valid = get_encodings_in_range(gmin, gmax)
            if len(valid) == 0:
                print >>sys.stderr, "no encodings for range: %s" % str((gmin, gmax))
                sys.exit()
            if len(valid) == 1 and opts.n == -1:
                print "\t".join(valid) + "\t" + str((gmin, gmax))
                sys.exit()
 
        if opts.n > 0 and i > opts.n:
            print "\t".join(valid) + "\t" + str((gmin, gmax))
            sys.exit()
 
    print "\t".join(valid) + "\t" + str((gmin, gmax))
 
 
if __name__ == "__main__":
    import doctest
    if doctest.testmod(optionflags=doctest.ELLIPSIS |\
                                   doctest.NORMALIZE_WHITESPACE).failed == 0:
        main()

References:
  1. http://en.wikipedia.org/wiki/FASTQ_format
  2. https://github.com/brentp/bio-playground/blob/master/reads-utils/guess-encoding.py

[ Main_Page ]