您的位置:首页 > 其它

Adam学习13之Fasta/Fastq/SAM/BAM文件格式数据读取

2016-04-30 22:33 405 查看
0.代码(读取方法):

package org.bdgenomics.adamLocal.algorithms.test

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.bdgenomics.adam.rdd.ADAMContext
import htsjdk.samtools.ValidationStringency
//import scala.collection.parallel.Foreach

object hs38DHL1Test1 {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("readFileFromFaFq").setMaster("local")
val sc = new SparkContext(conf)

val ac = new ADAMContext(sc)
val fileFa = "file/adam/hs38DH/hs38DHL1/hs38DHL1.fa"
// val fileFq = "file/adam/hs38DH/hs38DHSE1L100F1.fq"
val fileFq = "file/adam/hs38DH/hs38DHL1/hs38DHL1F1Len10.fq"
val fileSam = "file/adam/hs38DH/hs38DHL1/hs38DHL1F1Len10.sam"

val fq0 = sc.textFile(fileFa)
println("fq0.count:" + fq0.count);

//load Fasta
println(fileFa + ":");
val fa1 = ac.loadFasta(fileFa, 10000)
var fa1Segquence = fa1.map(_.getFragmentSequence()).collect
// fq1Segquence.foreach(println)
println("fa1Segquence.length:" + fa1Segquence.length);
for (i <- 0 until fa1Segquence.length) {
println(fa1Segquence(i));
}
println();
fa1.foreach(println)

//load Fastq
println();
println(fileFq + ":");
val fq1 = ac.loadFastq(fileFq, None, None, ValidationStringency.STRICT)
var fq1Segquence = fq1.map(_.getSequence()).collect
// fq1Segquence.foreach(println)
println("fq1Segquence.length:" + fq1Segquence.length);
for (i <- 0 until fq1Segquence.length) {
println(fq1Segquence(i));
}
println();
fq1.foreach(println)

println("SAM:");
val sam1=ac.loadAlignments(fileSam)
sam1.foreach(println)
sc.stop

}
}

1.Fasta:ac.loadFasta(fileFa, 10000)
源文件:

>chrUn_KN707606v1_decoy AC:KN707606.1 gi:734691250 LN:2200 rl:unplaced M5:20c768ac79ca38077e5012ee0e5f8333 AS:hs38d1
ctagtagctgggactacaagcgcccgccaccacacccggctaatttttttgtatttttagtggagacaggtttcaccgtg
ttggccaggatggtctcgatctcctgaccttgtgatctgcccaccttgccctcccaaagtgctgggattacaggcatgag
ccaccatacccggcagTGTCCTATCCATTTTTAAGGCAGCCACTTGGAGTTGGAGCATGTCTTTCTCTCATAATCTCTTA
CCAGATGTCTCAGAGCAGCCTGTGCACTTTAACTCCAGACATTCTGCCACTGAGCCCCCTAGAGCTCCAGCTTTTAAAGC
ACTTGGGGTGAGCCTCGAGAGATGACAGACGGAGCTGCCCAAGAGCTGCCAGCTGCCAACCCTGCCTGGGGCTTCACGGC
CCGCGCCCTACTTCCTCTCAGCTGGCTCCACACCCTGGGGCGTGTAATTTCCAAATTCTCACTCCCAGGGCTAATTTGGG
GGATAAGACATTTGATTAGAAGTATCAgaaaccagctgggcatggtggctcacacctgtaatcccagcactttgggaggt
tatgactagaggatcatttgaactcaggaattcaagaccagcctggataacagtgagaccccatctctacaaaatataaa
caattatgtgagcatggtggtgcacacctgtagtccctgttccttgggaggctgaggccggaggatcccttgagcccagg
agttcaaggctgcagagagctgcgattgtgccactgcacactaacctgggagatagagcaagaacttgtctcagaaaaaa
aaagtatcaggaaCTAATCTCCAGTCCTATCAAGTTAGGCATAAGGTCAATGTGTGATAGCTGAGTGTCACAGAAACCAA
GGACAGGAATGCAACTGCCACTGGGGATGAACTGGAAGTGGGGAGTTAAACCACCTCAGAATGTccccatttttgtttct
tctccagATGTGCTGCTTTGCTTTTCCGTATGTTTCTCTACGGACCAGCTACCTCTCCTCTGCCAACAGATCCAAGTTGT
GCATGTTATGGGTCCAAACACCACGTGACAAGCCCATTCTTCCAGTTTCTCAGACCAGAAACTGCACTGTCCTCTAACTG
CTTCTTCTCCCTCTTGCATCTGGTCCTTGGGGAAATCCTGTTTGCCCGGCCTTCAGCATATATCCACAGTTTAACCTTAA
CCACTCCTCGCCACCACTCGCGGGGGCGAGCAGCCTTCGCCCCCTGCCTAGATTACTACAGTAACTTCATTGTTCTTTCT
ACTTCTCTCTTTGCCCCTCTGCTATCTCAAAACAGCATCCAAAATGCACCTAGCAAGAGCATGTCATTCCTCTGCACAAA
ACTCTccaacttctctctttttttttttttttttttttttgagacggagtctcactctgtcacccaggctggagtgcaat
agtgtgatcttggctcactgcaacctccacctcccagattcaagcgattctcctgcctcagcctcctgagtagctgagat
tacaggttcatgtcaccatgcccggctaatttttgtatttttagtagagacagggtttcaccatgttagtcaggctggtc
tcgaactcctgaccttgtgatccacccgcctcagcctcccaaagtgctgggattataggcatgagccaccgtgcatgacC
AACTTCTCTTTTTGTTCAGAGTAAAAGCCAACGGCCCATGAGGCTTTCCATGGTCACGCCTCCGCTCATTCGCTCTGTGG
CTTTGTCTTACACGGGTTCACTCCTCACTGGCCGCCTTGCTGACCCCATAGCTCACGGGCCTTACTCTGCTctcggggcc
tttgcacttgctccaCTGCAAATGCTCCTCCCCCAGAGGCCTTTGTGGCCCATTCCCTCGGTTCCTTAGGAACAATCCCT
TCCCTGGTCAAACCTCCACTGACATCTGTCTCCTtcccttctgaattttttttctccgGTAGTATTTATCACTCTGCTAT
CCTTAGGATTTCCTTATCTTGTTTATCATCATCTCCTCATCCAGAGcttaagtcctttttttttttttgagatagagtct
cgctctgtcgcccaggctggagtgcagtggcgcgatctcgtctcgctgaaagctccacctcccgggttcacgccattctc
ccgcctcagcctcccgagtagctgggactacaggcactcg


读取结果:
{"contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": null}, "description": "AC:KN707606.1 gi:734691250 LN:2200 rl:unplaced M5:20c768ac79ca38077e5012ee0e5f8333 AS:hs38d1", "fragmentSequence": "ctagtagctgggactacaagcgcccgccaccacacccggctaatttttttgtatttttagtggagacaggtttcaccgtgttggccaggatggtctcgatctcctgaccttgtgatctgcccaccttgccctcccaaagtgctgggattacaggcatgagccaccatacccggcagTGTCCTATCCATTTTTAAGGCAGCCACTTGGAGTTGGAGCATGTCTTTCTCTCATAATCTCTTACCAGATGTCTCAGAGCAGCCTGTGCACTTTAACTCCAGACATTCTGCCACTGAGCCCCCTAGAGCTCCAGCTTTTAAAGCACTTGGGGTGAGCCTCGAGAGATGACAGACGGAGCTGCCCAAGAGCTGCCAGCTGCCAACCCTGCCTGGGGCTTCACGGCCCGCGCCCTACTTCCTCTCAGCTGGCTCCACACCCTGGGGCGTGTAATTTCCAAATTCTCACTCCCAGGGCTAATTTGGGGGATAAGACATTTGATTAGAAGTATCAgaaaccagctgggcatggtggctcacacctgtaatcccagcactttgggaggttatgactagaggatcatttgaactcaggaattcaagaccagcctggataacagtgagaccccatctctacaaaatataaacaattatgtgagcatggtggtgcacacctgtagtccctgttccttgggaggctgaggccggaggatcccttgagcccaggagttcaaggctgcagagagctgcgattgtgccactgcacactaacctgggagatagagcaagaacttgtctcagaaaaaaaaagtatcaggaaCTAATCTCCAGTCCTATCAAGTTAGGCATAAGGTCAATGTGTGATAGCTGAGTGTCACAGAAACCAAGGACAGGAATGCAACTGCCACTGGGGATGAACTGGAAGTGGGGAGTTAAACCACCTCAGAATGTccccatttttgtttcttctccagATGTGCTGCTTTGCTTTTCCGTATGTTTCTCTACGGACCAGCTACCTCTCCTCTGCCAACAGATCCAAGTTGTGCATGTTATGGGTCCAAACACCACGTGACAAGCCCATTCTTCCAGTTTCTCAGACCAGAAACTGCACTGTCCTCTAACTGCTTCTTCTCCCTCTTGCATCTGGTCCTTGGGGAAATCCTGTTTGCCCGGCCTTCAGCATATATCCACAGTTTAACCTTAACCACTCCTCGCCACCACTCGCGGGGGCGAGCAGCCTTCGCCCCCTGCCTAGATTACTACAGTAACTTCATTGTTCTTTCTACTTCTCTCTTTGCCCCTCTGCTATCTCAAAACAGCATCCAAAATGCACCTAGCAAGAGCATGTCATTCCTCTGCACAAAACTCTccaacttctctctttttttttttttttttttttttgagacggagtctcactctgtcacccaggctggagtgcaatagtgtgatcttggctcactgcaacctccacctcccagattcaagcgattctcctgcctcagcctcctgagtagctgagattacaggttcatgtcaccatgcccggctaatttttgtatttttagtagagacagggtttcaccatgttagtcaggctggtctcgaactcctgaccttgtgatccacccgcctcagcctcccaaagtgctgggattataggcatgagccaccgtgcatgacCAACTTCTCTTTTTGTTCAGAGTAAAAGCCAACGGCCCATGAGGCTTTCCATGGTCACGCCTCCGCTCATTCGCTCTGTGGCTTTGTCTTACACGGGTTCACTCCTCACTGGCCGCCTTGCTGACCCCATAGCTCACGGGCCTTACTCTGCTctcggggcctttgcacttgctccaCTGCAAATGCTCCTCCCCCAGAGGCCTTTGTGGCCCATTCCCTCGGTTCCTTAGGAACAATCCCTTCCCTGGTCAAACCTCCACTGACATCTGTCTCCTtcccttctgaattttttttctccgGTAGTATTTATCACTCTGCTATCCTTAGGATTTCCTTATCTTGTTTATCATCATCTCCTCATCCAGAGcttaagtcctttttttttttttgagatagagtctcgctctgtcgcccaggctggagtgcagtggcgcgatctcgtctcgctgaaagctccacctcccgggttcacgccattctcccgcctcagcctcccgagtagctgggactacaggcactcg", "fragmentNumber": 0, "fragmentStartPosition": 0, "fragmentLength": 2200, "numberOfFragmentsInContig": 1}


2.Fastq:
val fq1 = ac.loadFastq(fileFq, None, None, ValidationStringency.STRICT)

原数据:
@chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0/1
CTCCTCGCCA
+
2222222222


读取结果:
{"readNum": null, "contig": null, "start": null, "oldPosition": null, "end": null, "mapq": null, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": null, "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": null, "readMapped": false, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": null, "mateNegativeStrand": null, "primaryAlignment": null, "secondaryAlignment": null, "supplementaryAlignment": null, "mismatchingPositions": null, "origQual": null, "attributes": null, "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}


3.SAM:
println("SAM:");
val sam1=ac.loadAlignments(fileSam)
sam1.foreach(println)

源数据:

@SQ SN:chrUn_KN707606v1_decoy LN:2200
@PG ID:bwa PN:bwa VN:0.7.12-r1039 CL:bwa samse -f hs38DHL1F1Len10.sam hs38DHL1.fa hs38DHL1F1Len10.sai hs38DHL1F1Len10.fq
chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0 0 chrUn_KN707606v1_decoy 1204 37 10M * 0 0 CTCCTCGCCA 2222222222 XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:10


读取结果:
{"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}


4.BAM:
println("BAM:");
println("methods 1:");
val bam1 = ac.loadAlignments(fileBam)
bam1.foreach(println)
println("methods 2:");
val bam2 = ac.loadBam(fileBam)

BAM是SAM的二进制,直接打开乱码

读取结果:

BAM:
methods 1:
{"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}
methods 2:
{"readNum": 0, "contig": {"contigName": "chrUn_KN707606v1_decoy", "contigLength": 2200, "contigMD5": null, "referenceURL": null, "assembly": null, "species": null, "referenceIndex": 0}, "start": 1203, "oldPosition": null, "end": 1213, "mapq": 37, "readName": "chrUn_KN707606v1_decoy_1204_1728_0:0:0_1:0:0_0", "sequence": "CTCCTCGCCA", "qual": "2222222222", "cigar": "10M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPaired": false, "properPair": false, "readMapped": true, "mateMapped": false, "failedVendorQualityChecks": false, "duplicateRead": false, "readNegativeStrand": false, "mateNegativeStrand": false, "primaryAlignment": true, "secondaryAlignment": false, "supplementaryAlignment": false, "mismatchingPositions": "10", "origQual": null, "attributes": "XT:A:U\tXO:i:0\tXM:i:0\tNM:i:0\tXG:i:0\tX1:i:0\tX0:i:1", "recordGroupName": null, "recordGroupSequencingCenter": null, "recordGroupDescription": null, "recordGroupRunDateEpoch": null, "recordGroupFlowOrder": null, "recordGroupKeySequence": null, "recordGroupLibrary": null, "recordGroupPredictedMedianInsertSize": null, "recordGroupPlatform": null, "recordGroupPlatformUnit": null, "recordGroupSample": null, "mateAlignmentStart": null, "mateAlignmentEnd": null, "mateContig": null, "inferredInsertSize": null}


BAM跟SAM读取结果是一样的,本来内容也是一样的
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息