# Fqsampler.pl
#
# From the BITS wiki.
#!/usr/bin/perl
# Joachim Jacob, 2011
# joachim.jacob@vib.be
 
use strict;
use warnings;
use POSIX;
 
# ---------------------------------------------------------------------------
# Usage: Fqsampler.pl <fastq file> <sample size>
#
# Writes a random sample of <sample size> FASTQ entries (4 lines each) from
# <fastq file> to standard output, in the order they occur in the input.
# Every entry has the same chance of being picked.  The script dies with a
# message on STDERR when an argument is missing or invalid, when the input
# cannot be read, or when the sample is too large for the input.
# ---------------------------------------------------------------------------

my $usage = "Usage: pass the name of the fastq file to sample and the required sample size\n";

# Plain die (not "die print"): the message goes to STDERR, so it can never
# end up inside a redirected sample file.
my ($in, $samplesize) = @ARGV;
die $usage unless defined $in && length $in && defined $samplesize;
die "The sample size must be a positive whole number.\n"
    unless $samplesize =~ /\A[0-9]+\z/ && $samplesize > 0;

# Count the input lines in Perl itself.  The file name is never handed to a
# shell, so names containing spaces or shell metacharacters are safe.
open(my $fastq, '<', $in) or die "Cannot open '$in' for reading: $!\n";
my $noflines = 0;
while (defined(my $line = <$fastq>)) {
    $noflines++;
}
close($fastq);

# A sample can only be drawn when it is at least ten times smaller than the
# input; the input size is measured in lines for this check.
if ($noflines / 10 < $samplesize) {
    die "The sample size must be at least ten times smaller than the size of the input.\n";
}

# Four lines per entry; ceil() counts a trailing partial entry as an entry.
my $nofEntries = ceil($noflines / 4);

# Decide which entries to sample: $sampleArray[$i] is 1 when entry $i
# (0-based) is selected and 0 otherwise.
my @sampleArray = (0) x $nofEntries;
my $count = 0;
# The size check above guarantees $samplesize < $nofEntries, so this loop
# always finds enough distinct entries and terminates.
while ($count < $samplesize) {
    # int(rand(N)) is uniform over 0 .. N-1, i.e. exactly the valid indices.
    my $pick = int(rand($nofEntries));
    next if $sampleArray[$pick];    # already selected, draw again
    $sampleArray[$pick] = 1;
    $count++;
}

# Second pass over the input: print every line that belongs to a selected
# entry.  Line number N (0-based) belongs to entry int(N / 4).
open($fastq, '<', $in) or die "Cannot open '$in' for reading: $!\n";
my $lineno = 0;
while (defined(my $line = <$fastq>)) {
    print $line if $sampleArray[int($lineno / 4)];
    $lineno++;
}
close($fastq);