您的位置:首页 > 运维架构 > Linux

linux下perl处理文本---使用hash处理

2013-12-17 21:21 369 查看
# Usage: perl gather_family_protein_gene.pl -a name_of_animal -e ../Extract_result -n GATHER
#
# For every species named in the -a file, scan the plain files found in
# <extract_dir>/<species>/ and append one line per protein to
# <extract_dir>/<species>/<save_name>, formatted as:
#
#     <protein>=>[<gene>\t]<family>=><evalue>[\t|\t<family>=><evalue>...]
#
# <family> is the name of the file the e-value line was read from, cut at its
# first '.'.  <gene> is taken from "protein<TAB>gene" lines found in the same
# directory and is only present when such a line exists for the protein.
use strict;
use warnings;
use Getopt::Long;
use File::Basename;

# The input options list
my ($animalName, $extractDir, $saveName, $help, $errorLog);
GetOptions(
    'help|h' => \$help,
    'a:s'    => \$animalName,
    'e:s'    => \$extractDir,
    'n:s'    => \$saveName,
    'r:s'    => \$errorLog,
);

# Print the command-line help text to STDOUT.
sub usage {
    print <<USAGE;
usage:
#version:       perl $0 [options]
#author:        Oshyn Song <dualyangsong\@gmail.com>
#history:       2013-12-17
#desc:          Gather the extract result to a file by species
options:
-h  --help:print the info
-a  :input the animal species filename
-e  :input the extract result directory
-n  :the save result filaname
-r	:the error log file name
#perl $0
USAGE
    return;
}

# Return the part of $name before its first '.', or $name unchanged when it
# contains no '.'.  (Feeding index()'s -1 straight into substr() as a length
# would silently drop the last character instead.)
sub _strip_extension {
    my ($name) = @_;
    my $dot = index($name, '.');
    return $dot < 0 ? $name : substr($name, 0, $dot);
}

# Read one result file and record what it contains.
#   $filepath - file to read
#   $tfname   - family label stored with every e-value found in this file
#   $hits_of  - hashref: protein id => arrayref of "family=>evalue" strings
#   $gene_of  - hashref: protein id => gene id (first mapping seen wins)
# Dies when the file cannot be opened.
sub _collect_file {
    my ($filepath, $tfname, $hits_of, $gene_of) = @_;

    open(my $in, '<', $filepath)
        or die "Can not open the file : ${filepath} $!";

    while (defined(my $line = <$in>)) {
        chomp $line;

        if ($line =~ /^(ENS[\w]+?[\d]{11})[\t]([0-9e\-\.]+)$/) {
            # "protein<TAB>evalue" line
            my ($protein, $evalue) = ($1, $2);
            push @{ $hits_of->{$protein} }, "${tfname}=>${evalue}";
        }
        elsif ($line =~ /^(ENS[\w]+?[\d]{11})[\t](ENS[\w]+?[\d]{11})/) {
            # "protein<TAB>gene" line.  Kept in its own table so the result
            # does not depend on whether this line is read before or after
            # the protein's e-value lines.
            my ($protein, $gene) = ($1, $2);
            $gene_of->{$protein} = $gene unless exists $gene_of->{$protein};
        }
    }

    close $in;
    return;
}

# Change the STDERR to errorlog file
if (!defined $errorLog) {
    $errorLog = "errorlog";
}
open(STDERR, '>>', $errorLog) or die "Can not open errorlog $!";

# Test if given the necessary options
if (defined $help
    || !(defined $animalName && defined $extractDir && defined $saveName))
{
    usage();
    exit 0;
}

print "Start process...\n";

# Open the names of all animal species file
open(my $animals, '<', $animalName)
    or die "Can not open file of ${animalName} $!";
print "Open the file of animal species name successfully.\n\n";

# Read the animal species name every line
while (defined(my $entry = <$animals>)) {
    chomp $entry;
    my $animal_name = _strip_extension($entry);

    # A blank line would otherwise make the script process $extractDir itself.
    next if $animal_name !~ /\S/;

    my $speciesDir = "${extractDir}/${animal_name}";
    opendir(my $dh, $speciesDir)
        or die "Can not open directory of ${speciesDir}. $!";
    print "process ${animal_name}...\n";

    # Open the result file (append mode: earlier results are kept)
    my $outPath = "${speciesDir}/${saveName}";
    open(my $out, '>>', $outPath) or die "Can not open ${outPath}. $!";

    # Read every filename and open it; sorted so the order of the family
    # entries in the output is reproducible between runs.
    my (%hits_of, %gene_of);
    for my $filename (sort readdir $dh) {
        next if $filename =~ /^\./;

        # The result file lives in the directory being scanned; never feed
        # it back in as input.
        next if $filename eq $saveName;

        my $filepath = "${speciesDir}/${filename}";
        next unless -f $filepath and -r $filepath;

        _collect_file($filepath, _strip_extension($filename),
            \%hits_of, \%gene_of);
    }
    closedir $dh;

    # Only proteins with at least one e-value are written; the gene id, when
    # known, is placed in front of the family list.
    for my $protein (sort keys %hits_of) {
        my $value = join "\t|\t", @{ $hits_of{$protein} };
        $value = "$gene_of{$protein}\t${value}" if exists $gene_of{$protein};
        print {$out} "${protein}=>${value}\n";
    }

    # Buffered write errors (e.g. disk full) only surface at close.
    close($out) or die "Can not close ${outPath}. $!";
    print "${animal_name} process finished!\n\n";
}
close $animals;
处理后的结果如下:
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: