Species from blast.pl

From BITS wiki
Jump to: navigation, search
 
#!usr/local/bin/perl
# SpeciesFromBLAST.pl
# (c) Joachim Jacob 2009
 
#################################################
 
# BLASTresultparser
 
# Program description :
 
# Takes the flat text output of BLASTx (!!) search, 
 
# searches which queries doesn't have a hit 
 
# and classifies the other queries, to have hits in animals, plants, prokaryotes, nematodes, etc.
 
# ! cab take a long time to run !
 
#
 
# INPUT: provide file with BLAST output as parameter
 
# OUTPUT: 	- classification of the hits in Classification_***.txt
 
#		- species information of the hits in Species_of_hits_***.txt  <---
 
#		- list of queries without hits in Nohits_***.txt
 
#		- list of queries with hits in Nohits_***.txt
 
#		- Summary and progress to standard output
 
#
 
# Just give it a try! Enjoy!
 
#########################################
 
 
 
$|=1;
 
 
 
 
 
use strict;
 
use warnings;
 
use Bio::SeqIO;
 
use Bio::Species;
 
use POSIX;
 
 
 
###########  PARAMETERS  ##############
 
my $inputfile = $ARGV[0];	# input file (BLASTx results from blastcl3 (flat text))
 
my $batchsize = 50; 		# number of sequences of which the genbank file is retrieved at once using efetch
 
#####################################
 
 
 
# Nematodes are classified more into detail, the remainder species are classified to phylum or class level.
 
########## nematode classification ############### 
 
my %fln=('Zeldia punctata','1','Caenorhabditis','2','Pristionchus','3');
 
my %ppn=("Radopholus",0,"Globodera",0,"Heterodera",0,"Meloidogyne",0,"Pratylenchus",0,"Xiphinema",0,"Bursaphelenchus",0,
 
"Fergusobia",0);
 
my %apn=("Toxocara",0,"Litomosoides",0,"Ancylostoma",0,"Teladorsagia",0,"Ascaris",0,"Brugia",0,"Heterorhabditis",0,
 
"Wuchereria",0,"Onchocerca",0,"Haemonchus",0,"Nippostrongylus",0,"Necator",0,"Ostertagia",0,"Trichinella",0,"Trichuris",0,
 
"Dirofilaria",0,"Angiostrongylus",0,"Anisakis",0,"Trichostrongylus",0,"Strongyloides",0,"Parastrongyloides",0,"Schistosoma",0,
 
"Steinernema",0,"Fergusobia",0,"Dictyocaulus",0);
 
###############################################
 
 
 
print "SpeciesFromBLAST\n(c)Joachim Jacob\n=========================\n\n";
 
 
 
# check input
 
if($inputfile eq ""){ die print "\n*** Please provide an inputfile ! ***\n\n";}
 
 
 
my $species_output = "Species_of_hits_".$inputfile.".txt";		# fasta like file with genus names of hits
 
my $datatable = "Classification_".$inputfile.".txt";
 
my $db="Protein";
 
unlink($species_output);
 
unlink($datatable);
 
 
 
# initiating temporary file
 
############################
 
my $temp_file = "Specfrobla.temp.txt";
 
open(TEMP,">$temp_file");
 
print TEMP "";
 
close(TEMP);
 
 
 
my(%hitstore,$lijn,$id,%nohits,%hits,$hits,$querycount,@queries);
 
 
 
# reading input
 
#################################
 
 
 
 
 
 
 
print "Reading input... ";
 
open(INPUT,"< $inputfile")||die print "Error: can not open * $inputfile *\n\n";
 
my @input=<INPUT>;
 
 
 
my $hitcount = 0;
 
my $nohitcount = 0;
 
my $switch1 = 0;
 
my $switch2 = 0;
 
my $switch3 = 0;
 
my $eswitch = 0;
 
my $evalue = 0;
 
 
 
 
 
foreach $lijn (@input){
 
	#print $lijn,"\n";
 
	chomp $lijn;
 
 
 
	if ($lijn =~/\s+Database: All non-redundant GenBank CDS/ && $switch3 == 1){		# end of hits: reset
 
		$switch1 = 0;
 
		$switch2 = 0;
 
		$switch3 = 0;
 
		$eswitch = 0;
 
 
 
	}	
 
 
 
	if ($lijn =~/^\S+.+\s+\d{2,3}\s{3}(\S+)/ && $switch1 == 1 && $switch2 == 1 && $switch3 == 1){
 
		#print "\t$lijn\n";
 
		$hits++;
 
		if ($eswitch == 0 && $evalue == 0){$evalue = $1; $eswitch = 1;}
 
		if ($lijn =~/^\w{2,4}\|([^\|]+)\|/){
 
			#print "\t$1\n";
 
			$hits{$id}++;
 
			push @{ $hitstore{$id} }, $1;
 
		}
 
	}
 
 
 
 
 
	if ($lijn =~/Sequences producing significant alignments:/ && $switch1 == 1 && $switch2 == 0){
 
		$switch2 = 1;
 
		$switch3 = 1;
 
		#print "$id - hits found\n";
 
		push(@queries,$id);
 
		$hitcount++;
 
	}
 
 
 
	if ($lijn =~/ \*\*\*\*\* No hits found \*\*\*\*\*\*/ && $switch1 == 1 && $switch2 == 0){		# identification no hits
 
		$switch1 = 0;
 
		$switch2 = 1;
 
		#print "$id - no hits found\n";
 
		$nohits{$id}++;
 
		$nohitcount++;
 
	}
 
 
 
	if ($lijn =~/^Query= (\S+)/ && $switch1 == 0){		# identification new query
 
		$switch1 = 1;
 
		$switch2 = 0;
 
		#print "$1\n";
 
		$id=$1;
 
		if($id=~/gi\|(\d+)\|\S+/){ $id=$1;}			#gi|51237540|gb|CO897750.1|CO897750
 
 
 
		$evalue=0;
 
		$querycount++;
 
 
 
 
 
	}
 
}
 
 
 
print "Done.\n";
 
######################
 
#   efetching hits
 
######################
 
print "Retrieving taxonomy...";
 
my $bar_end=$querycount/100;
 
my $bar_progress=0;
 
chmod 0644,$datatable;
 
open(DATATABLE,">>$datatable");
 
print DATATABLE "ID\tClass\tppn\tapn\tfln\tinvert\tvert\tfungi\tplant\totherEuk\tTotEuk\tProk\tTotalwithClass\tNoClass\tTotalChckd\tTotalHits\n";
 
 
 
my ($d,$result,$temp,@hitaccs,$arraylength,$i,$seq_obj,$seq_in,$genuscount,@noclassification,@other);		
 
my $query_string="";
 
my $species_result;
 
my($species,$genus,@classification,$bi,$totalcheck);
 
my(@ppn,@parnema,@nema,@invert,@animal,@euk,@fungus,@plant,@prok,@general,@hgt,@lowereuk);
 
my$ppncount=0;my$parcount=0;my$nemacount=0;my$invertcount=0;my$animalcount=0;my$eukcount=0;my$lowereukcount=0;
 
my$fungicount=0;my$plantcount=0;my$prokcount=0;my$gencount=0;my$hgtcount=0;my$othercount=0;
 
my$progress_count=0;my $whilecount;my $checkcount; my $querysize;
 
 
 
print "\nOrder of processing\n";
 
for my $keys (keys %hitstore){					# hash of arrays	
 
	print "\t>$keys\n";
 
}
 
print "\n\n";
 
 
 
chmod 0664,$species_output;
 
open(RESULT,">>$species_output");
 
 
 
for my $keys (keys %hitstore){					# hash of arrays	
 
	print RESULT ">$keys\n";					# $key = BLAST query
 
	#print "-> @{ $hitstore{$keys} }\n"; 
 
 
 
	# progress bar
 
	############################
 
	if($bar_progress<$bar_end){$bar_progress++;$progress_count++;}else{$bar_progress=0;
 
		print "\n",ceil(10000*$progress_count/$hitcount)/100,"%...\n";
 
	}
 
 
 
	my $fln=0;my$ppn=0;my$apn=0;my$vert=0;my$plant=0;my$invert=0;my$fungi=0;my$othereuk=0;my$prok=0;my$noclass=0;
 
	my $total_class=0;
 
	my $total=0;
 
	my $eukaryotes=0;
 
 
 
	# efetching
 
	##############
 
 
 
	@hitaccs=@{ $hitstore{$keys} };
 
	$arraylength=@hitaccs;
 
	$querysize=0;
 
	for ($i=1; $i<=$arraylength ; $i++){
 
		$query_string=$query_string.", ".$hitaccs[$i-1];
 
		$querysize++;
 
		if(($i)%$batchsize==0 || $i==$arraylength){			# passing to server in batches of hundred acc numbers
 
 
 
			$whilecount=0;
 
 
 
#			# if no data is received, retry efetching 5 time
 
			do{	
 
				efetch_query($query_string,$db,$temp_file);
 
				$whilecount++;
 
				$checkcount=0;
 
				if (-z $temp_file){}else{
 
					$seq_in = Bio::SeqIO -> new('-format' => 'GenBank','-file' => $temp_file);
 
					while ( $seq_obj = $seq_in->next_seq() ) { $checkcount++;}
 
 
 
				}
 
				#print "\n\t*** efetched $keys \t Query size:$querysize ==> check: $checkcount ***\n";
 
			}until( $whilecount>5 || $querysize == $checkcount);
 
 
 
			print "\t[$keys ; stringnr ",ceil($i/$querysize),"; size $querysize :$whilecount time(s) tried]\n";
 
			$query_string="";
 
			$querysize=0;
 
 
 
 
 
	# writing species information
 
	################################
 
 
 
 
 
			$seq_in = Bio::SeqIO -> new('-format' => 'GenBank','-file' => $temp_file);
 
 
 
			while ( $seq_obj = $seq_in->next_seq() ) {
 
 
 
 
 
				if($seq_obj->species()){			# using BIO::species for retrieving species info
 
					$species= $seq_obj->species() or die "... euhm, probeer een keer opnieuw...";
 
					$genus = $species->genus();	#   $bi = $species->binomial();   $bi is now "Homo sapiens"
 
					$bi = $species -> binomial();
 
					@classification = $species->classification();
 
					print RESULT "$bi, ";
 
 
 
					$genuscount++;
 
 
 
	# classifying queries based on hits
 
	####################################					
 
 
 
					if(@classification){				# if @classification is not empty
 
						if(grep /Nematoda/, @classification){			# only nematodes
 
							if (exists $fln{$genus}){ $fln++; }
 
							if (exists $ppn{$genus}){ $ppn++; }
 
							if (exists $apn{$genus}){ $apn++; }						
 
						}elsif(grep /Chordata/, @classification){ $vert++;   	# only vertebrate
 
						}elsif(grep /Metazoa/, @classification){ $invert++; 	# no nematodes no vertebrates
 
						}elsif(grep /Viridiplantae/, @classification){$plant++; # only plants
 
						}elsif(grep /Fungi/, @classification){$fungi++; 	# fungi
 
						}elsif(grep /Eukaryota/, @classification){ $othereuk++; 	#print " -- OTHER EUKARYOTE\t";}
 
						}elsif(grep /Bacteria/, @classification){ $prok++; 	#print " -- PROKARYOTE\t";} 
 
						}elsif(grep /Eubacteria/, @classification){ $prok++; 	#print " -- PROKARYOTE\t";} 
 
						}elsif(grep /Archaea/, @classification){$prok++;	#print " -- PROKARYOTE\t";}
 
						}else { $noclass++; 	#print  " -- no classification\t";
 
						}
 
					}else{$noclass++;}				
 
					#foreach(@classification){print $_,"\t";}
 
					#print "\n";					
 
 
 
				}
 
			}
 
			open(TEMP,">$temp_file");		# emptying temp file for efetching
 
				print TEMP "";
 
			close(TEMP);
 
		}
 
	}
 
	#print RESULT "$genuscount\n";
 
 
 
	print RESULT "\n";
 
	$genuscount=0;
 
 
 
 
 
	# grouping queries by hit classification.
 
	#############################################
 
 
 
	my $class_switch=0;
 
 
 
	$eukaryotes=$fln+$apn+$ppn+$invert+$vert+$plant+$fungi+$othereuk;
 
	$total_class=$eukaryotes+$prok;
 
	$total=$total_class+$noclass;
 
	print DATATABLE "$keys\t";
 
	if( $ppn>0 && $apn==0 && $fln==0 && $invert==0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk==0 && $prok==0){
 
		$ppncount++; $totalcheck++;
 
		push(@ppn,$keys); 
 
		print DATATABLE "PPN\t";
 
		$class_switch=1;
 
	}
 
	if ($fln==0 && $ppn>=0 && $apn>0 && $invert==0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk==0 && $prok==0){
 
		$parcount++; $totalcheck++;
 
		push(@parnema,$keys); 
 
		print DATATABLE "PARNEMA\t";
 
		$class_switch=1;
 
	}
 
	if ($fln>0 && $ppn>=0 && $apn>=0 && $invert==0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk==0 && $prok==0){
 
		$nemacount++; $totalcheck++;
 
		push(@nema,$keys); 
 
		print DATATABLE "NEMA\t";
 
		$class_switch=1;
 
	}
 
	if ($fln>0 && $ppn>=0 && $apn>=0 &&$invert>0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk==0 && $prok==0){
 
		$invertcount++; $totalcheck++;
 
		push(@invert,$keys); 
 
		print DATATABLE "INVERT\t";
 
		$class_switch=1;
 
	}
 
	if ($fln>0 && $ppn>=0 && $apn>=0 && $invert>=0 && $vert>0 && $fungi==0 && $plant==0 && $othereuk>=0 && $prok==0){
 
		$animalcount++; $totalcheck++;
 
		push(@animal,$keys); 
 
		print DATATABLE "ANIMAL\t";
 
		$class_switch=1;
 
	}
 
	if ($fln>=0 && $ppn>=0 && $apn>=0 && $invert>=0 && $vert>0 && $fungi>=0 && $plant>0 && $othereuk>=0 && $prok==0){
 
		$eukcount++; $totalcheck++;
 
		push(@euk,$keys); 
 
		print DATATABLE "EUK\t";
 
		$class_switch=1;
 
	}
 
	if ($fln==0 && $ppn==0 && $apn==0 && $invert==0 && $vert==0 && $fungi>0 && $plant>=0  && $prok==0){
 
		$fungicount++; $totalcheck++;
 
		push(@fungus,$keys); 
 
		print DATATABLE "FUNGUS/PLANT\t";
 
		$class_switch=1;
 
	}
 
	if ($fln==0 && $ppn==0 && $apn==0 && $invert==0 && $vert==0 && $fungi==0 && $plant>0 && $othereuk==0 && $prok==0){
 
		$plantcount++; $totalcheck++;
 
		push(@plant,$keys); 
 
		print DATATABLE "PLANT\t";
 
		$class_switch=1;
 
	}
 
	if ($eukaryotes==0 && $prok>0){
 
		$prokcount++; $totalcheck++;
 
		push(@prok,$keys); 
 
		print DATATABLE "PROK\t";
 
		$class_switch=1;
 
	}
 
	if ($eukaryotes>0 && $prok>0){
 
		$gencount++; $totalcheck++;
 
		push(@general,$keys); 
 
		print DATATABLE "GENERAL\t";
 
		$class_switch=1;
 
	}
 
	if ($fln==0 && $ppn==0 && $apn==0 && $invert==0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk>0 && $prok==0){
 
		$othercount++; $totalcheck++;
 
		push(@other,$keys);
 
		print DATATABLE "OTHER\t";
 
		$class_switch=1;
 
	}
 
	if ( $fln>=0 && $ppn>=0 && $apn>=0 && $invert>0 && $vert==0 && $fungi>0 && $plant>0 && $othereuk>=0 && $prok==0){
 
		$lowereukcount++; $totalcheck++;
 
		push(@lowereuk,$keys);
 
		print DATATABLE "LOWEREUK\t";
 
		$class_switch=1;
 
	}
 
	if ($fln==0 && $ppn>0 && $apn==0 && $invert==0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk==0 && $prok>0){
 
		$hgtcount++; $totalcheck++;
 
		push(@hgt,$keys);
 
		print DATATABLE "-HGT\t";
 
		$class_switch=1;
 
	}
 
	if($class_switch == 0){push(@noclassification,$keys); print "-\t";}
 
	print DATATABLE "$ppn\t$apn\t$fln\t$invert\t$vert\t$fungi\t$plant\t$othereuk\t$eukaryotes\t$prok\t$total_class\t$noclass\t$total\t$hits{$keys}\t"; 
 
 
 
	print DATATABLE "\n";
 
 
 
}	
 
close(RESULT);
 
close(DATATABLE);
 
unlink("Specfrobla.temp.txt");
 
print "\n\tDone.\n\n";
 
 
 
 
 
# reporting
 
###################
 
 
 
my $nohitfile = "Nohits_".$inputfile.".txt";
 
my $hitfile = "Hits_".$inputfile.".txt";
 
 
 
 
 
print "*** Results can be found in:\n\|\"$datatable\"\n\|\"$species_output\"\n\|\"$nohitfile\"\n\|\"$hitfile\"\n\n";
 
print "*** GENERAL OVERVIEW \n";
 
print "\|Total number of queries        :$querycount\n";
 
print "\|Number of queries without hits :$nohitcount (",(ceil(10000*$nohitcount/$querycount))/100,"%)\n";
 
print "\|Number of queries with hits    :$hitcount (",(ceil(10000*$hitcount/$querycount))/100,"%)\n\n";
 
print "*** Overview classification of queries based on hits of \"",$inputfile,"\" file\n\n";
 
 
 
print "PPN          : ",$ppncount,"\n";
 
print "Par nema     : ",$parcount,"\n";
 
print "Nema         : ",$nemacount,"\n";
 
print "Other Invert.: ",$invertcount,"\n";
 
print "Animals      : ",$animalcount++,"\n";
 
print "Plant        : ",$plantcount,"\n";
 
print "Fungus/Plant : ",$fungicount,"\n";
 
print "Lower euk.   : ",$lowereukcount," (all euks but vertebrates)\n";
 
print "Eukaryotes   : ",$eukcount,"\n";
 
print "Prokaryotes  : ",$prokcount,"\n";
 
print "General	     : ",$gencount,"\n";
 
print "Other        : ",$othercount,"\n\n";
 
 
 
print "HGT candidate: ",$hgtcount,"\n";
 
print "Horizontal Gene Transfer with prokaryotes, candidates:\n";
 
foreach (@hgt){
 
	print "\t$_\n";
 
}
 
if( @hgt == 0){ print "\tNo HGT candidates\n";}
 
my $noclasscount=(@noclassification);
 
print "\nNot classified: ",$noclasscount," \n";
 
foreach (@noclassification){
 
	print "\t$_\n";
 
}
 
 
 
# storing list of queries with and without hits
 
##########################################
 
open(NOHITOUT,">$nohitfile")||die print "\n Could not write NOHITOUT!!\n";
 
for my $key (keys %nohits){
 
	print NOHITOUT "$key\n";
 
 
 
}
 
close(NOHITOUT);
 
open(HITOUT,">$hitfile")||die print "\n Could not write HITOUT!!\n";
 
for my $key (keys %hits){
 
	print HITOUT "$key\n";
 
 
 
}
 
close(HITOUT);
 
 
 
print "\n\nProgram done.\n";
 
 
 
 
 
 
 
 
 
#############
 
#Subroutines#
 
#############
 
 
 
sub efetch_query {
 
 
 
	use Bio::DB::Query::GenBank;
 
	use Bio::SeqIO;
 
	use Bio::DB::GenBank;
 
 
 
	my $ef_query_string = shift;
 
	my $ef_db    = shift;
 
	my $file = shift;	
 
	my $out = Bio::SeqIO->new(-file => ">$file", -format => 'genbank');
 
 
 
	my $query = Bio::DB::Query::GenBank->new( -db    => $ef_db,
 
						  -query => $ef_query_string );
 
	my $gb     = new Bio::DB::GenBank;
 
 
 
	my $stream = $gb->get_Stream_by_query($query);
 
	while ( my $seq = $stream->next_seq ) {
 
	    $out->write_seq($seq); 
 
	}
 
}
 
 
 
 
 
sub usage {
 
 
 
  print <<USAGE and exit;
 
$0 <options>
 
            --query -q
 
                        ncbi query. E.g. 'tomato[orgn]'
 
            --database -d
 
                        ncbi database. E.g. 'Nucleotide','Protein','PubMed' (default: Nucleotide)
 
            --output -o
 
                        ouputfile (default: fetched.out)
 
            --input -i
 
			inputfile (no default)
 
 
 
USAGE
 
}