您的位置:首页 > 其它

从NCBI基因组数据中获得cds,pep和geneID对应表

2014-08-08 16:52 1286 查看
在做基因组相关分析时,我们常常需要从基因组中提取cds,并翻译成相应的pep序列。此脚本,以NCBI数据库中标准的基因组序列文件和对应的gff文件为输入文件,快速获得cds序列,pep序列,RNA,Protein和gene的对应关系表等相关文件。

A perl script which deals with ncbi raw data,and from which get cds ,pep and gene,mRNA and protein ID list.

使用方法如下:

perl $0 gff_filegenomes_fa_file file_prefix

#! /usr/bin/perl -w

=head1 #######################################

=head1 name
grep_cds_pep_from_ncbi_genomes_datas.pl

=head1 description
deal with ncbi raw data,and from which get cds ,pep and gene,mRNA and protein ID list.

=head1 example
perl  $0 ref_ConCri1.0_top_level.gff3.gz ccr_ref_ConCri1.0_chrUn.fa.gz  mole
perl  $0 gff_file genomes_fa_file  file_prefix

=head1 author
original from Xiangfeng Li,xflee0608@163.com
##2014-4-19/21##

=head1 #######################################
=cut

use strict;
die `pod2text $0` unless @ARGV==3;
my ($gff,$fa,$prefix)=@ARGV;

##deal gff file##
$gff=~/gz$/ ? (open GFF,"gzip -cd $gff|"||die):(open GFF,$gff||die);
my (%mrna,%cds,%pep);
while(<GFF>){
chomp;
next if(/^#/);
my @p=split/\t/,$_;
my @q=split/;/,$p[8];
my ($rna,$pep,$nt,$gene);
my $chr=$p[0];
if($p[2] eq "mRNA"){
($rna=$q[0])=~s/ID=//;
($nt=$q[1])=~s/Name=//;
($gene=$q[-3])=~s/gene=//;
$mrna{$chr}{$rna}{strand}=$p[6];
$cds{$rna}=[$gene,$nt];
}
if($p[2] eq "CDS"){
($pep=$q[1])=~s/Name=//;
($rna=$q[2])=~s/Parent=//;
push @{$mrna{$chr}{$rna}{nt}},[$p[3],$p[4]];
$pep{$rna}=$pep;
}
}
close GFF;

##get id list##
my %anno;
my $id_gene_cds_pep=$prefix."_id_gene_cds_pep.lst";
open ID, ">",$id_gene_cds_pep||die;
foreach my $i(sort keys %pep){
if($cds{$i}){
my $out=join "\t",$i,$cds{$i}[0],$cds{$i}[1],$pep{$i};
print ID $out,"\n";
($anno{$i}=$out)=~s/\t/\|/g;
}
}
warn "create file:$id_gene_cds_pep\n";
close ID;

##deal fa file##
my %max;
$fa=~/gz$/ ? (open FA,"gzip -cd $fa|"||die):(open FA,$fa||die);
my $raw_cds=$prefix."_raw_cds";
open CDS1,">$raw_cds"||die;
my ($start,$end);
$/=">";<FA>;$/="\n";
while(<FA>){
my $name=$1 if(/(\S+)/);
my $info=(split/\|/,$name)[-1];
$/=">";
my $seq=<FA>;
$/="\n";
$seq=~s/>|\n+//g;
my $scaf=$mrna{$info};
foreach my $k(sort keys %$scaf){
next if(! exists $scaf->{$k}{nt});
my @p=@{$scaf->{$k}{nt}};
my $strand=$$scaf{$k}{strand};
my $get;
@p=sort{$a->[0]<=>$b->[0]}@p;
my $loc1=$p[0][0];
my $loc2=$p[-1][1];
my ($get_len,$add,$out,$gene);
if(exists $anno{$k}){
$add=$anno{$k};
$gene=(split/\|/,$add)[1];
}else{next;}
foreach(@p){
($start,$end)=@$_[0,1];
$get.=uc(substr($seq,$start-1,$end-$start+1));
}
if($strand eq "+"){
$get_len=length$get;
$get=~s/([A-Z]{50})/$1\n/g;
chop($get) unless($get_len%50);
$out=">$add LOC=$info :$loc1:$loc2:+ length=$get_len\n$get\n";
push @{$max{$gene}},[$get_len,$out];
print CDS1 $out;
}
if($strand eq "-"){
$get=&reverse_complement($get);
$get_len=length$get;
$get=~s/([A-Z]{50})/$1\n/g;
chop($get) unless($get_len%50);
$out=">$add LOC=$info :$loc1:$loc2:- length=$get_len\n$get\n";
push @{$max{$gene}},[$get_len,$out];
print CDS1 $out;
}
}
}
warn "create file:$raw_cds\n";
close FA;
close CDS1;

##get max transcript##
my $filter_cds=$prefix."_filter_cds";
open CDS2,">$filter_cds"||die;
my @a;
foreach my $j(keys %max){
my @trans=@{$max{$j}};
@trans=sort{$a->[0] cmp $b->[0]}@trans;
push @a,$trans[-1][1];
}
my @a_new;
foreach(@a){
my $r=$1 if(/^>rna(\d+)/);
push @a_new,[$r,$_];
}
my @cds_sort=map{$_->[1]}
sort{$a->[0] <=> $b->[0]}@a_new;
print CDS2 $_ for@cds_sort;
close CDS2;
warn "create file:$filter_cds\n";

##get raw pep sequences##
my $raw_pep=$prefix."_raw_pep";
open PEP1,">",$raw_pep||die;
my @raw_pep=&cds2pep($raw_cds);
print PEP1 $_ for@raw_pep;
close PEP1;
warn "create file:$raw_pep\n";

##get filter pep sequences##
my $filter_pep=$prefix."_filter_pep";
open PEP2,">$filter_pep"||die;
my @filter_pep=&cds2pep($filter_cds);
print PEP2 $_ for@filter_pep;
close PEP2;
warn "create file:$filter_pep\n";

##add label for cds and pep of filter##
my $label=uc($prefix);
open IN1,$filter_cds||die;
my $filter_cds_label=$prefix."_filter_cds_label";
open OUT1,">",$filter_cds_label||die;
while(<IN1>){
chomp;
if(/^>/){
my @a=split/\|/,$_,2;
my $name=$a[0]."_$label";
print OUT1"$name |$a[1]\n";
}else{print OUT1 "$_\n";}
}
close IN1;
close OUT1;
warn "create file:$filter_cds_label\n";

open IN2,$filter_pep||die;
my $filter_pep_label=$prefix."_filter_pep_label";
open OUT2,">",$filter_pep_label||die;
while(<IN2>){
chomp;
if(/^>/){
my @a=split/\|/,$_,2;
my $name=$a[0]."_$label";
print OUT2 "$name |$a[1]\n";
}else{print OUT2 "$_\n";}
}
close IN2;
close OUT2;
warn "create file:$filter_pep_label\n";

##timing##
my $time=times;
my $time_out=sprintf "%.2f",$time/60;
print "##########\nElapsed Time :$time_out mins\n##########\n";

##subroutine##
sub reverse_complement{
my ($seq)=shift;
$seq=reverse$seq;
$seq=~tr/AaGgCcTt/TtCcGgAa/;
return $seq;
}

##subroutine##
sub cds2pep{
my $file=shift;
my %code = (
"standard" =>
{
'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',                               # Alanine
'TGC' => 'C', 'TGT' => 'C',                                                           # Cysteine
'GAC' => 'D', 'GAT' => 'D',                                                           # Aspartic Aci
'GAA' => 'E', 'GAG' => 'E',                                                           # Glutamic Aci
'TTC' => 'F', 'TTT' => 'F',                                                           # Phenylalanin
'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',                               # Glycine
'CAC' => 'H', 'CAT' => 'H',                                                           # Histidine
'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I',                                             # Isoleucine
'AAA' => 'K', 'AAG' => 'K',                                                           # Lysine
'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L', 'TTA' => 'L', 'TTG' => 'L',   # Leucine
'ATG' => 'M',                                                                         # Methionine
'AAC' => 'N', 'AAT' => 'N',                                                           # Asparagine
'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',                               # Proline
'CAA' => 'Q', 'CAG' => 'Q',                                                           # Glutamine
'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R', 'AGA' => 'R', 'AGG' => 'R',   # Arginine
'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S', 'AGC' => 'S', 'AGT' => 'S',   # Serine
'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T',                               # Threonine
'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V',                               # Valine
'TGG' => 'W',                                                                         # Tryptophan
'TAC' => 'Y', 'TAT' => 'Y',                                                           # Tyrosine
'TAA' => 'U', 'TAG' => 'U', 'TGA' => 'U'                                              # Stop
}
## more translate table could be added here in future
## more translate table could be added here in future
## more translate table could be added here in future
);
open IN,$file||die;
$/=">";<IN>;$/="\n";
my @results_set;
while(<IN>){
my $info=$_;
my @a=split/\s+/,$info;
$/=">";my $seq=<IN>;$/="\n";
$seq=~s/\n|>//g;
my $len=length$seq;
my $info_out=join " ",@a[0..($#a-1)];
my ($pep_out,$triplet);
for(my $i=0;$i<$len;$i+=3){
$triplet=substr($seq,$i,3);
next if(length$triplet!=3);
if(exists $code{standard}{$triplet}){
$pep_out.=$code{standard}{$triplet};
}else{$pep_out.="X"}
}
$pep_out=~s/U$// if($pep_out=~/U$/);
my $pep_len=length$pep_out;
$pep_out=~s/([A-Z]{50})/$1\n/g;
chop($pep_out) unless($pep_len%50);
my $results= ">$info_out length=$pep_len\n$pep_out\n";
push @results_set,$results;
}
return @results_set;
}

__END__
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: