perl应用:DNA序列翻译(下):从fasta格式中读取序列,然后输出蛋白质序列,以及fasta格式的介绍
2012-10-23 22:45
686 查看
use strict;
use warnings;
# Main program: ask the user for a FASTA file, pull the DNA sequence out of
# it, translate the DNA to a peptide and print the peptide 25 residues per
# line.  Each variable is declared where it first receives its value.
my @filedata = get_file_data();
my $dna      = extract_sequence_from_fasta_data(@filedata);
my $protein  = dna2peptide($dna);
print_sequence($protein, 25);
sub get_file_data
{
    # Prompt for a file path on STDIN, read that file and return its lines.
    #
    # Arguments: none.
    # Returns:   the list of lines read from the file (line endings kept).
    # Dies:      when no path can be read from STDIN, or when the file
    #            cannot be opened for reading.
    print "please input the Path just like this f:\\\\perl\\\\data.txt\n";

    my $dna_filename = <STDIN>;
    defined $dna_filename
        or die("no file name was given on STDIN!");
    chomp($dna_filename);

    # Three-argument open with an explicit '<' mode: whatever the user typed
    # is used purely as a file name, so a leading '>' or '|' can neither
    # truncate a file nor start a command.
    open(my $dna_fh, '<', $dna_filename)
        or die("can not open the file! ($dna_filename: $!)");

    my @filedata = <$dna_fh>;
    close $dna_fh;

    return @filedata;
}
sub extract_sequence_from_fasta_data
{
    # Collapse the lines of a FASTA file into one sequence string.
    #
    # Author's note on the FASTA layout: a first line starting with '>'
    # that carries the name and origin of the sequence, then the sequence
    # in standard one-letter symbols, then '*' marking the end.
    #
    # Arguments: the lines of the file, as a list.
    # Returns:   every line that is not blank, not a '#' comment and not a
    #            '>' header, joined together with all whitespace removed.
    my @sequence_lines = grep {
        $_ !~ /^\s*$/        # skip blank lines
            && $_ !~ /^\s*#/ # skip comment lines
            && $_ !~ /^>/    # skip the FASTA header line
    } @_;

    my $sequence = join('', @sequence_lines);
    $sequence =~ s/\s//g;

    return $sequence;
}
sub print_sequence
{
    # Print a sequence to the currently selected output handle, broken into
    # lines of a fixed width.
    #
    # Arguments: $sequence - the string to print
    #            $length   - number of characters per output line
    # Returns:   nothing meaningful.
    # Dies:      when $length is undefined or smaller than 1.
    my ($sequence, $length) = @_;

    # A width of 0 (or undef) would never advance $pos and the loop below
    # would print empty lines forever, so reject it up front.
    unless (defined $length && $length >= 1)
    {
        die("print_sequence: line length must be a number >= 1");
    }

    for (my $pos = 0; $pos < length($sequence); $pos += $length)
    {
        print substr($sequence, $pos, $length), "\n";
    }
}
BEGIN
{
    # Codon table: each DNA codon (hash key) maps to the one-letter amino
    # acid it encodes (hash value); '_' stands for a stop codon.
    #
    # The table is built once instead of on every call.  It lives in a BEGIN
    # block so that it is already filled in when code located above this
    # definition calls codon2aa at run time.
    my %genetic_code = (
        'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S',    # Serine
        'AGC' => 'S', 'AGT' => 'S',                                # Serine
        'TTC' => 'F', 'TTT' => 'F',                                # Phenylalanine
        'TTA' => 'L', 'TTG' => 'L',                                # Leucine
        'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L',    # Leucine
        'TAC' => 'Y', 'TAT' => 'Y',                                # Tyrosine
        'TAA' => '_', 'TAG' => '_', 'TGA' => '_',                  # Stop
        'TGC' => 'C', 'TGT' => 'C',                                # Cysteine
        'TGG' => 'W',                                              # Tryptophan
        'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',    # Proline
        'CAC' => 'H', 'CAT' => 'H',                                # Histidine
        'CAA' => 'Q', 'CAG' => 'Q',                                # Glutamine
        'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R',    # Arginine
        'AGA' => 'R', 'AGG' => 'R',                                # Arginine
        'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I',                  # Isoleucine
        'ATG' => 'M',                                              # Methionine
        'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T',    # Threonine
        'AAC' => 'N', 'AAT' => 'N',                                # Asparagine
        'AAA' => 'K', 'AAG' => 'K',                                # Lysine
        'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V',    # Valine
        'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',    # Alanine
        'GAC' => 'D', 'GAT' => 'D',                                # Aspartic Acid
        'GAA' => 'E', 'GAG' => 'E',                                # Glutamic Acid
        'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',    # Glycine
    );

    sub codon2aa
    {
        # Translate one DNA codon into its one-letter amino acid.
        #
        # Arguments: $codon - a three-letter DNA codon, in either case
        # Returns:   the amino-acid letter, or '_' for a stop codon.
        # Dies:      with 'Bad codon "..."!!' when the codon is not in the
        #            table.  Uncaught, this writes the same line to STDERR
        #            as before but ends the program with a failure status
        #            instead of a success status.
        my ($codon) = @_;

        # The table keys are upper case; uc turns lower case into upper case.
        $codon = uc $codon;

        if (exists $genetic_code{$codon})
        {
            return $genetic_code{$codon};
        }

        die "Bad codon \"$codon\"!!\n";
    }
}
sub dna2peptide
{
    # Translate a DNA string into a peptide string.
    #
    # Arguments: $dna - the DNA sequence
    # Returns:   the results of codon2aa for each complete group of three
    #            characters, taken in order from offset 0 and joined
    #            together; one or two leftover characters at the end are
    #            ignored.
    my ($dna) = @_;

    my $whole_codons = int(length($dna) / 3);
    my @residues =
        map { codon2aa(substr($dna, 3 * $_, 3)) } 0 .. $whole_codons - 1;

    # Author's lesson learned: a missing return value here once left the
    # result empty, so a subroutine must always return its value.
    return join('', @residues);
}
结果如下:
use strict;
use warnings;

# Main program: ask the user for a FASTA file, pull the DNA sequence out of
# it, translate the DNA to a peptide and print the peptide 25 residues per
# line.  Each variable is declared where it first receives its value.
my @filedata = get_file_data();
my $dna      = extract_sequence_from_fasta_data(@filedata);
my $protein  = dna2peptide($dna);
print_sequence($protein, 25);
sub get_file_data
{
    # Prompt for a file path on STDIN, read that file and return its lines.
    #
    # Arguments: none.
    # Returns:   the list of lines read from the file (line endings kept).
    # Dies:      when no path can be read from STDIN, or when the file
    #            cannot be opened for reading.
    print "please input the Path just like this f:\\\\perl\\\\data.txt\n";

    my $dna_filename = <STDIN>;
    defined $dna_filename
        or die("no file name was given on STDIN!");
    chomp($dna_filename);

    # Three-argument open with an explicit '<' mode: whatever the user typed
    # is used purely as a file name, so a leading '>' or '|' can neither
    # truncate a file nor start a command.
    open(my $dna_fh, '<', $dna_filename)
        or die("can not open the file! ($dna_filename: $!)");

    my @filedata = <$dna_fh>;
    close $dna_fh;

    return @filedata;
}
sub extract_sequence_from_fasta_data
{
    # Collapse the lines of a FASTA file into one sequence string.
    #
    # Author's note on the FASTA layout: a first line starting with '>'
    # that carries the name and origin of the sequence, then the sequence
    # in standard one-letter symbols, then '*' marking the end.
    #
    # Arguments: the lines of the file, as a list.
    # Returns:   every line that is not blank, not a '#' comment and not a
    #            '>' header, joined together with all whitespace removed.
    my @sequence_lines = grep {
        $_ !~ /^\s*$/        # skip blank lines
            && $_ !~ /^\s*#/ # skip comment lines
            && $_ !~ /^>/    # skip the FASTA header line
    } @_;

    my $sequence = join('', @sequence_lines);
    $sequence =~ s/\s//g;

    return $sequence;
}
sub print_sequence
{
    # Print a sequence to the currently selected output handle, broken into
    # lines of a fixed width.
    #
    # Arguments: $sequence - the string to print
    #            $length   - number of characters per output line
    # Returns:   nothing meaningful.
    # Dies:      when $length is undefined or smaller than 1.
    my ($sequence, $length) = @_;

    # A width of 0 (or undef) would never advance $pos and the loop below
    # would print empty lines forever, so reject it up front.
    unless (defined $length && $length >= 1)
    {
        die("print_sequence: line length must be a number >= 1");
    }

    for (my $pos = 0; $pos < length($sequence); $pos += $length)
    {
        print substr($sequence, $pos, $length), "\n";
    }
}
BEGIN
{
    # Codon table: each DNA codon (hash key) maps to the one-letter amino
    # acid it encodes (hash value); '_' stands for a stop codon.
    #
    # The table is built once instead of on every call.  It lives in a BEGIN
    # block so that it is already filled in when code located above this
    # definition calls codon2aa at run time.
    my %genetic_code = (
        'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S',    # Serine
        'AGC' => 'S', 'AGT' => 'S',                                # Serine
        'TTC' => 'F', 'TTT' => 'F',                                # Phenylalanine
        'TTA' => 'L', 'TTG' => 'L',                                # Leucine
        'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L',    # Leucine
        'TAC' => 'Y', 'TAT' => 'Y',                                # Tyrosine
        'TAA' => '_', 'TAG' => '_', 'TGA' => '_',                  # Stop
        'TGC' => 'C', 'TGT' => 'C',                                # Cysteine
        'TGG' => 'W',                                              # Tryptophan
        'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',    # Proline
        'CAC' => 'H', 'CAT' => 'H',                                # Histidine
        'CAA' => 'Q', 'CAG' => 'Q',                                # Glutamine
        'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R',    # Arginine
        'AGA' => 'R', 'AGG' => 'R',                                # Arginine
        'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I',                  # Isoleucine
        'ATG' => 'M',                                              # Methionine
        'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T',    # Threonine
        'AAC' => 'N', 'AAT' => 'N',                                # Asparagine
        'AAA' => 'K', 'AAG' => 'K',                                # Lysine
        'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V',    # Valine
        'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',    # Alanine
        'GAC' => 'D', 'GAT' => 'D',                                # Aspartic Acid
        'GAA' => 'E', 'GAG' => 'E',                                # Glutamic Acid
        'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',    # Glycine
    );

    sub codon2aa
    {
        # Translate one DNA codon into its one-letter amino acid.
        #
        # Arguments: $codon - a three-letter DNA codon, in either case
        # Returns:   the amino-acid letter, or '_' for a stop codon.
        # Dies:      with 'Bad codon "..."!!' when the codon is not in the
        #            table.  Uncaught, this writes the same line to STDERR
        #            as before but ends the program with a failure status
        #            instead of a success status.
        my ($codon) = @_;

        # The table keys are upper case; uc turns lower case into upper case.
        $codon = uc $codon;

        if (exists $genetic_code{$codon})
        {
            return $genetic_code{$codon};
        }

        die "Bad codon \"$codon\"!!\n";
    }
}
sub dna2peptide
{
    # Translate a DNA string into a peptide string.
    #
    # Arguments: $dna - the DNA sequence
    # Returns:   the results of codon2aa for each complete group of three
    #            characters, taken in order from offset 0 and joined
    #            together; one or two leftover characters at the end are
    #            ignored.
    my ($dna) = @_;

    my $whole_codons = int(length($dna) / 3);
    my @residues =
        map { codon2aa(substr($dna, 3 * $_, 3)) } 0 .. $whole_codons - 1;

    # Author's lesson learned: a missing return value here once left the
    # result empty, so a subroutine must always return its value.
    return join('', @residues);
}
结果如下:
F:\>f:perl\a.pl
RWRR_GVLGALGRPPTGLQRRRRMG
PAQ_EYAAWEA_LEAEVVVGAFATA
WDAAEWSVQVRGSLAGVVRECAGSG
DMEGDGSDPEPPDAGEDSKSENGEN
APIYCICRKPDINCFMIGCDNCNEW
FHGDCIRITEKMAKAIREWYCRECR
EKDPKLEIRYRHKKSRERDGNERDS
SEPRDEGGGRKRPVPDPDLQRRAGS
GTGVGAMLARGSASPHKSSPQPLVA
TPSQHHQQQQQQIKRSARMCGECEA
CRRTEDCGHCDFCRDMKKFGGPNKI
RQKCRLRQCQLRARESYKYFPSSLS
PVTPSESLPRPRRPLPTQQQPQPSQ
KLGRIREDEGAVASSTVKEPPEATA
TPEPLSDEDL
F:\>
相关文章推荐
- perl从文件中读取数据,然后输出,附一个蛋白质序列的读取
- perl应用:DNA序列翻译为蛋白质的完整程序(中)
- perl,读取所需文件的路径,然后打开相应的文件,并对文件中的DNA序列进行计数,substr函数对长字符串的片段化处理功能
- perl:DNA序列翻译成氨基酸序列的若干方法,直接法,简并法,哈希法,以及perl中的uc和lc函数(上)
- perl应用:六框阅读翻译DNA序列
- perl应用:DNA序列酶切图谱的创建
- 利用BioPerl将DNA序列翻译成蛋白序列
- perl应用:DNA互补序列的获取
- perl应用:SNP的提取(2):从对比序列中找到SNP位点并输出 a.pl
- [置顶] HDFS入门笔记------架构以及应用介绍
- RabbitMQ的应用场景以及基本原理介绍
- Flume入门笔记------架构以及应用介绍
- RabbitMQ 应用场景以及基本原理介绍
- RabbitMQ的应用场景以及基本原理介绍
- RabbitMQ的应用场景以及基本原理介绍
- 在一个win Forms应用中嵌入以及播放WAV音频文件(翻译五)
- Memcached缓存系统的介绍、安装以及应用方法详解
- AppleWatch开发教程之Watch应用对象新增内容介绍以及编写运行代码
- 编写一段程序,从标准输入读取string对象的序列直到连续出现两个相同的单词或者所有单词都读完为止。使用while循环一次读取一个单词,当一个单词连续出现两次是使用break语句终止循环。输出连续重复出现的单词,或者输出一个消息说明没有人任何单词是重复出现的。