Species from blast.pl
From BITS wiki
#!usr/local/bin/perl # SpeciesFromBLAST.pl # (c) Joachim Jacob 2009 ################################################# # BLASTresultparser # Program description : # Takes the flat text output of BLASTx (!!) search, # searches which queries doesn't have a hit # and classifies the other queries, to have hits in animals, plants, prokaryotes, nematodes, etc. # ! cab take a long time to run ! # # INPUT: provide file with BLAST output as parameter # OUTPUT: - classification of the hits in Classification_***.txt # - species information of the hits in Species_of_hits_***.txt <--- # - list of queries without hits in Nohits_***.txt # - list of queries with hits in Nohits_***.txt # - Summary and progress to standard output # # Just give it a try! Enjoy! ######################################### $|=1; use strict; use warnings; use Bio::SeqIO; use Bio::Species; use POSIX; ########### PARAMETERS ############## my $inputfile = $ARGV[0]; # input file (BLASTx results from blastcl3 (flat text)) my $batchsize = 50; # number of sequences of which the genbank file is retrieved at once using efetch ##################################### # Nematodes are classified more into detail, the remainder species are classified to phylum or class level. ########## nematode classification ############### my %fln=('Zeldia punctata','1','Caenorhabditis','2','Pristionchus','3'); my %ppn=("Radopholus",0,"Globodera",0,"Heterodera",0,"Meloidogyne",0,"Pratylenchus",0,"Xiphinema",0,"Bursaphelenchus",0, "Fergusobia",0); my %apn=("Toxocara",0,"Litomosoides",0,"Ancylostoma",0,"Teladorsagia",0,"Ascaris",0,"Brugia",0,"Heterorhabditis",0, "Wuchereria",0,"Onchocerca",0,"Haemonchus",0,"Nippostrongylus",0,"Necator",0,"Ostertagia",0,"Trichinella",0,"Trichuris",0, "Dirofilaria",0,"Angiostrongylus",0,"Anisakis",0,"Trichostrongylus",0,"Strongyloides",0,"Parastrongyloides",0,"Schistosoma",0, "Steinernema",0,"Fergusobia",0,"Dictyocaulus",0); ############################################### print "SpeciesFromBLAST\n(c)Joachim Jacob\n=========================\n\n"; # check input if($inputfile eq ""){ die print "\n*** Please provide an inputfile ! ***\n\n";} my $species_output = "Species_of_hits_".$inputfile.".txt"; # fasta like file with genus names of hits my $datatable = "Classification_".$inputfile.".txt"; my $db="Protein"; unlink($species_output); unlink($datatable); # initiating temporary file ############################ my $temp_file = "Specfrobla.temp.txt"; open(TEMP,">$temp_file"); print TEMP ""; close(TEMP); my(%hitstore,$lijn,$id,%nohits,%hits,$hits,$querycount,@queries); # reading input ################################# print "Reading input... "; open(INPUT,"< $inputfile")||die print "Error: can not open * $inputfile *\n\n"; my @input=<INPUT>; my $hitcount = 0; my $nohitcount = 0; my $switch1 = 0; my $switch2 = 0; my $switch3 = 0; my $eswitch = 0; my $evalue = 0; foreach $lijn (@input){ #print $lijn,"\n"; chomp $lijn; if ($lijn =~/\s+Database: All non-redundant GenBank CDS/ && $switch3 == 1){ # end of hits: reset $switch1 = 0; $switch2 = 0; $switch3 = 0; $eswitch = 0; } if ($lijn =~/^\S+.+\s+\d{2,3}\s{3}(\S+)/ && $switch1 == 1 && $switch2 == 1 && $switch3 == 1){ #print "\t$lijn\n"; $hits++; if ($eswitch == 0 && $evalue == 0){$evalue = $1; $eswitch = 1;} if ($lijn =~/^\w{2,4}\|([^\|]+)\|/){ #print "\t$1\n"; $hits{$id}++; push @{ $hitstore{$id} }, $1; } } if ($lijn =~/Sequences producing significant alignments:/ && $switch1 == 1 && $switch2 == 0){ $switch2 = 1; $switch3 = 1; #print "$id - hits found\n"; push(@queries,$id); $hitcount++; } if ($lijn =~/ \*\*\*\*\* No hits found \*\*\*\*\*\*/ && $switch1 == 1 && $switch2 == 0){ # identification no hits $switch1 = 0; $switch2 = 1; #print "$id - no hits found\n"; $nohits{$id}++; $nohitcount++; } if ($lijn =~/^Query= (\S+)/ && $switch1 == 0){ # identification new query $switch1 = 1; $switch2 = 0; #print "$1\n"; $id=$1; if($id=~/gi\|(\d+)\|\S+/){ $id=$1;} #gi|51237540|gb|CO897750.1|CO897750 $evalue=0; $querycount++; } } print "Done.\n"; ###################### # efetching hits ###################### print "Retrieving taxonomy..."; my $bar_end=$querycount/100; my $bar_progress=0; chmod 0644,$datatable; open(DATATABLE,">>$datatable"); print DATATABLE "ID\tClass\tppn\tapn\tfln\tinvert\tvert\tfungi\tplant\totherEuk\tTotEuk\tProk\tTotalwithClass\tNoClass\tTotalChckd\tTotalHits\n"; my ($d,$result,$temp,@hitaccs,$arraylength,$i,$seq_obj,$seq_in,$genuscount,@noclassification,@other); my $query_string=""; my $species_result; my($species,$genus,@classification,$bi,$totalcheck); my(@ppn,@parnema,@nema,@invert,@animal,@euk,@fungus,@plant,@prok,@general,@hgt,@lowereuk); my$ppncount=0;my$parcount=0;my$nemacount=0;my$invertcount=0;my$animalcount=0;my$eukcount=0;my$lowereukcount=0; my$fungicount=0;my$plantcount=0;my$prokcount=0;my$gencount=0;my$hgtcount=0;my$othercount=0; my$progress_count=0;my $whilecount;my $checkcount; my $querysize; print "\nOrder of processing\n"; for my $keys (keys %hitstore){ # hash of arrays print "\t>$keys\n"; } print "\n\n"; chmod 0664,$species_output; open(RESULT,">>$species_output"); for my $keys (keys %hitstore){ # hash of arrays print RESULT ">$keys\n"; # $key = BLAST query #print "-> @{ $hitstore{$keys} }\n"; # progress bar ############################ if($bar_progress<$bar_end){$bar_progress++;$progress_count++;}else{$bar_progress=0; print "\n",ceil(10000*$progress_count/$hitcount)/100,"%...\n"; } my $fln=0;my$ppn=0;my$apn=0;my$vert=0;my$plant=0;my$invert=0;my$fungi=0;my$othereuk=0;my$prok=0;my$noclass=0; my $total_class=0; my $total=0; my $eukaryotes=0; # efetching ############## @hitaccs=@{ $hitstore{$keys} }; $arraylength=@hitaccs; $querysize=0; for ($i=1; $i<=$arraylength ; $i++){ $query_string=$query_string.", ".$hitaccs[$i-1]; $querysize++; if(($i)%$batchsize==0 || $i==$arraylength){ # passing to server in batches of hundred acc numbers $whilecount=0; # # if no data is received, retry efetching 5 time do{ efetch_query($query_string,$db,$temp_file); $whilecount++; $checkcount=0; if (-z $temp_file){}else{ $seq_in = Bio::SeqIO -> new('-format' => 'GenBank','-file' => $temp_file); while ( $seq_obj = $seq_in->next_seq() ) { $checkcount++;} } #print "\n\t*** efetched $keys \t Query size:$querysize ==> check: $checkcount ***\n"; }until( $whilecount>5 || $querysize == $checkcount); print "\t[$keys ; stringnr ",ceil($i/$querysize),"; size $querysize :$whilecount time(s) tried]\n"; $query_string=""; $querysize=0; # writing species information ################################ $seq_in = Bio::SeqIO -> new('-format' => 'GenBank','-file' => $temp_file); while ( $seq_obj = $seq_in->next_seq() ) { if($seq_obj->species()){ # using BIO::species for retrieving species info $species= $seq_obj->species() or die "... euhm, probeer een keer opnieuw..."; $genus = $species->genus(); # $bi = $species->binomial(); $bi is now "Homo sapiens" $bi = $species -> binomial(); @classification = $species->classification(); print RESULT "$bi, "; $genuscount++; # classifying queries based on hits #################################### if(@classification){ # if @classification is not empty if(grep /Nematoda/, @classification){ # only nematodes if (exists $fln{$genus}){ $fln++; } if (exists $ppn{$genus}){ $ppn++; } if (exists $apn{$genus}){ $apn++; } }elsif(grep /Chordata/, @classification){ $vert++; # only vertebrate }elsif(grep /Metazoa/, @classification){ $invert++; # no nematodes no vertebrates }elsif(grep /Viridiplantae/, @classification){$plant++; # only plants }elsif(grep /Fungi/, @classification){$fungi++; # fungi }elsif(grep /Eukaryota/, @classification){ $othereuk++; #print " -- OTHER EUKARYOTE\t";} }elsif(grep /Bacteria/, @classification){ $prok++; #print " -- PROKARYOTE\t";} }elsif(grep /Eubacteria/, @classification){ $prok++; #print " -- PROKARYOTE\t";} }elsif(grep /Archaea/, @classification){$prok++; #print " -- PROKARYOTE\t";} }else { $noclass++; #print " -- no classification\t"; } }else{$noclass++;} #foreach(@classification){print $_,"\t";} #print "\n"; } } open(TEMP,">$temp_file"); # emptying temp file for efetching print TEMP ""; close(TEMP); } } #print RESULT "$genuscount\n"; print RESULT "\n"; $genuscount=0; # grouping queries by hit classification. ############################################# my $class_switch=0; $eukaryotes=$fln+$apn+$ppn+$invert+$vert+$plant+$fungi+$othereuk; $total_class=$eukaryotes+$prok; $total=$total_class+$noclass; print DATATABLE "$keys\t"; if( $ppn>0 && $apn==0 && $fln==0 && $invert==0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk==0 && $prok==0){ $ppncount++; $totalcheck++; push(@ppn,$keys); print DATATABLE "PPN\t"; $class_switch=1; } if ($fln==0 && $ppn>=0 && $apn>0 && $invert==0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk==0 && $prok==0){ $parcount++; $totalcheck++; push(@parnema,$keys); print DATATABLE "PARNEMA\t"; $class_switch=1; } if ($fln>0 && $ppn>=0 && $apn>=0 && $invert==0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk==0 && $prok==0){ $nemacount++; $totalcheck++; push(@nema,$keys); print DATATABLE "NEMA\t"; $class_switch=1; } if ($fln>0 && $ppn>=0 && $apn>=0 &&$invert>0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk==0 && $prok==0){ $invertcount++; $totalcheck++; push(@invert,$keys); print DATATABLE "INVERT\t"; $class_switch=1; } if ($fln>0 && $ppn>=0 && $apn>=0 && $invert>=0 && $vert>0 && $fungi==0 && $plant==0 && $othereuk>=0 && $prok==0){ $animalcount++; $totalcheck++; push(@animal,$keys); print DATATABLE "ANIMAL\t"; $class_switch=1; } if ($fln>=0 && $ppn>=0 && $apn>=0 && $invert>=0 && $vert>0 && $fungi>=0 && $plant>0 && $othereuk>=0 && $prok==0){ $eukcount++; $totalcheck++; push(@euk,$keys); print DATATABLE "EUK\t"; $class_switch=1; } if ($fln==0 && $ppn==0 && $apn==0 && $invert==0 && $vert==0 && $fungi>0 && $plant>=0 && $prok==0){ $fungicount++; $totalcheck++; push(@fungus,$keys); print DATATABLE "FUNGUS/PLANT\t"; $class_switch=1; } if ($fln==0 && $ppn==0 && $apn==0 && $invert==0 && $vert==0 && $fungi==0 && $plant>0 && $othereuk==0 && $prok==0){ $plantcount++; $totalcheck++; push(@plant,$keys); print DATATABLE "PLANT\t"; $class_switch=1; } if ($eukaryotes==0 && $prok>0){ $prokcount++; $totalcheck++; push(@prok,$keys); print DATATABLE "PROK\t"; $class_switch=1; } if ($eukaryotes>0 && $prok>0){ $gencount++; $totalcheck++; push(@general,$keys); print DATATABLE "GENERAL\t"; $class_switch=1; } if ($fln==0 && $ppn==0 && $apn==0 && $invert==0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk>0 && $prok==0){ $othercount++; $totalcheck++; push(@other,$keys); print DATATABLE "OTHER\t"; $class_switch=1; } if ( $fln>=0 && $ppn>=0 && $apn>=0 && $invert>0 && $vert==0 && $fungi>0 && $plant>0 && $othereuk>=0 && $prok==0){ $lowereukcount++; $totalcheck++; push(@lowereuk,$keys); print DATATABLE "LOWEREUK\t"; $class_switch=1; } if ($fln==0 && $ppn>0 && $apn==0 && $invert==0 && $vert==0 && $fungi==0 && $plant==0 && $othereuk==0 && $prok>0){ $hgtcount++; $totalcheck++; push(@hgt,$keys); print DATATABLE "-HGT\t"; $class_switch=1; } if($class_switch == 0){push(@noclassification,$keys); print "-\t";} print DATATABLE "$ppn\t$apn\t$fln\t$invert\t$vert\t$fungi\t$plant\t$othereuk\t$eukaryotes\t$prok\t$total_class\t$noclass\t$total\t$hits{$keys}\t"; print DATATABLE "\n"; } close(RESULT); close(DATATABLE); unlink("Specfrobla.temp.txt"); print "\n\tDone.\n\n"; # reporting ################### my $nohitfile = "Nohits_".$inputfile.".txt"; my $hitfile = "Hits_".$inputfile.".txt"; print "*** Results can be found in:\n\|\"$datatable\"\n\|\"$species_output\"\n\|\"$nohitfile\"\n\|\"$hitfile\"\n\n"; print "*** GENERAL OVERVIEW \n"; print "\|Total number of queries :$querycount\n"; print "\|Number of queries without hits :$nohitcount (",(ceil(10000*$nohitcount/$querycount))/100,"%)\n"; print "\|Number of queries with hits :$hitcount (",(ceil(10000*$hitcount/$querycount))/100,"%)\n\n"; print "*** Overview classification of queries based on hits of \"",$inputfile,"\" file\n\n"; print "PPN : ",$ppncount,"\n"; print "Par nema : ",$parcount,"\n"; print "Nema : ",$nemacount,"\n"; print "Other Invert.: ",$invertcount,"\n"; print "Animals : ",$animalcount++,"\n"; print "Plant : ",$plantcount,"\n"; print "Fungus/Plant : ",$fungicount,"\n"; print "Lower euk. : ",$lowereukcount," (all euks but vertebrates)\n"; print "Eukaryotes : ",$eukcount,"\n"; print "Prokaryotes : ",$prokcount,"\n"; print "General : ",$gencount,"\n"; print "Other : ",$othercount,"\n\n"; print "HGT candidate: ",$hgtcount,"\n"; print "Horizontal Gene Transfer with prokaryotes, candidates:\n"; foreach (@hgt){ print "\t$_\n"; } if( @hgt == 0){ print "\tNo HGT candidates\n";} my $noclasscount=(@noclassification); print "\nNot classified: ",$noclasscount," \n"; foreach (@noclassification){ print "\t$_\n"; } # storing list of queries with and without hits ########################################## open(NOHITOUT,">$nohitfile")||die print "\n Could not write NOHITOUT!!\n"; for my $key (keys %nohits){ print NOHITOUT "$key\n"; } close(NOHITOUT); open(HITOUT,">$hitfile")||die print "\n Could not write HITOUT!!\n"; for my $key (keys %hits){ print HITOUT "$key\n"; } close(HITOUT); print "\n\nProgram done.\n"; ############# #Subroutines# ############# sub efetch_query { use Bio::DB::Query::GenBank; use Bio::SeqIO; use Bio::DB::GenBank; my $ef_query_string = shift; my $ef_db = shift; my $file = shift; my $out = Bio::SeqIO->new(-file => ">$file", -format => 'genbank'); my $query = Bio::DB::Query::GenBank->new( -db => $ef_db, -query => $ef_query_string ); my $gb = new Bio::DB::GenBank; my $stream = $gb->get_Stream_by_query($query); while ( my $seq = $stream->next_seq ) { $out->write_seq($seq); } } sub usage { print <<USAGE and exit; $0 <options> --query -q ncbi query. E.g. 'tomato[orgn]' --database -d ncbi database. E.g. 'Nucleotide','Protein','PubMed' (default: Nucleotide) --output -o ouputfile (default: fetched.out) --input -i inputfile (no default) USAGE }