Fqsampler.pl
From BITS wiki
#!/usr/bin/perl
# Joachim Jacob, 2011
# joachim.jacob@vib.be
#
# Fqsampler.pl - draw a random sample of entries from a FASTQ file.
#
# Usage: Fqsampler.pl <fastq file> <sample size>
#
# The input is read as consecutive 4-line entries. The requested number of
# distinct entries is picked at random, and the picked entries are written to
# STDOUT in the order in which they occur in the input. Errors and usage
# messages go to STDERR so they never mix with the sampled data.
use strict;
use warnings;
use POSIX qw(ceil);

my $usage = "Usage: pass the name of the fastq file to sample and the required sample size\n";

# --- Arguments ---------------------------------------------------------------
my $in = $ARGV[0];
die $usage unless defined $in && length $in;

# The sample size must be a positive whole number of entries.
my $samplesize = $ARGV[1];
die $usage
    unless defined $samplesize
    && $samplesize =~ /\A[0-9]+\z/
    && $samplesize > 0;

# --- Count the lines of the input --------------------------------------------
# Counted in Perl rather than through a shell pipeline, so the file name is
# never interpreted by a shell.
open(my $counter, '<', $in) or die "Cannot open '$in' for reading: $!\n";
my $noflines = 0;
$noflines++ while <$counter>;
close($counter);

# We can draw a sample only if it is at least 10 times smaller than the input
# size (measured in lines). This also guarantees that the sample size is well
# below the number of entries, so the random draw below terminates quickly.
if ($noflines / 10 < $samplesize) {
    die "The sample size must be at least ten times smaller than the size of the input.\n";
}

my $nofEntries = ceil($noflines / 4);

# --- Decide which entries to sample ------------------------------------------
# Draw zero-based entry indices uniformly from 0 .. $nofEntries-1 until enough
# distinct ones have been collected. int() (not ceil()) keeps the index inside
# the valid range and gives entry 0 the same chance as every other entry.
my %selected;
while (keys(%selected) < $samplesize) {
    $selected{ int(rand($nofEntries)) } = 1;
}

# --- Write down the selected entries -----------------------------------------
open(my $fastq, '<', $in) or die "Cannot open '$in' for reading: $!\n";
my $lineno = 0;
while (my $line = <$fastq>) {
    # Lines 0-3 belong to entry 0, lines 4-7 to entry 1, and so on.
    my $entry = int($lineno / 4);
    print $line if $selected{$entry};
    $lineno++;
}
close($fastq);