您的位置:首页 > 编程语言 > Go语言

基因数据处理42之mango问题_seqdict.avro不存在解决

2016-05-30 20:09 471 查看
参考【1】中问题解决

问题分析:这是新版本引入的问题:ADAM 0.19.1 目前尚未发布到 Maven 中央仓库,因此下面的 pom 中使用的是 0.19.1-SNAPSHOT 快照版本。

解决办法:

package org.gcdss.test

import java.io.File
import java.nio.file.Files

import org.apache.parquet.hadoop.metadata.CompressionCodecName
import org.apache.spark.{SparkContext, SparkConf}
import org.bdgenomics.adam.rdd.{ADAMSaveAnyArgs}
import org.bdgenomics.adam.rdd.ADAMContext._

/**
* Created by xubo on 2016/5/30.
*/
/**
 * Loads an alignment file through ADAM and writes it back out as Parquet,
 * passing the sequence dictionary and record groups along so that the output
 * directory also receives the `_seqdict.avro` and `_rgdict.avro` side files.
 */
object fastqSaveAdam0191 {
  /** Input used when no path is supplied on the command line. */
  private val DefaultInput = "datatest2.sort.bam"

  /**
   * Resolves a classpath resource to its file path.
   *
   * @param path resource name, looked up via the system class loader
   * @return the file component of the resource URL
   * @throws NullPointerException if the resource does not exist
   */
  def resourcePath(path: String): String = ClassLoader.getSystemClassLoader.getResource(path).getFile

  /**
   * Creates a fresh temporary directory and returns a path for `path` inside it.
   *
   * @param path file name to append to the new directory
   * @return absolute path of the form `<new temp dir>/<path>`
   */
  def tmpFile(path: String): String = Files.createTempDirectory("").toAbsolutePath.toString + "/" + path

  /**
   * Builds a unique output location in the system temp directory.
   *
   * `File.createTempFile` is used only to obtain a unique name; the empty
   * placeholder it creates is scheduled for deletion when the JVM exits so it
   * does not accumulate across runs.
   *
   * @param suffix extension appended to the generated name
   * @return absolute path of the (not yet existing) output location
   */
  private def tempLocation(suffix: String = ".adam"): String = {
    val placeholder = File.createTempFile("ADAMContextSuite", "")
    placeholder.deleteOnExit()
    new File(placeholder.getParentFile, placeholder.getName + suffix).getAbsolutePath
  }

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` is the alignment file to load. When absent,
   *             `datatest2.sort.bam` is used.
   */
  def main(args: Array[String]): Unit = {
    // The simple name of a Scala object ends in '$'; strip it for the app name.
    val appName = getClass.getSimpleName.filter(_ != '$')
    val conf = new SparkConf().setMaster("local[4]").setAppName(appName)
    val sc = new SparkContext(conf)

    // Stop the context even when loading or saving fails.
    try {
      val inputFile = args.headOption.getOrElse(DefaultInput)
      val alignments = sc.loadAlignments(inputFile)

      val loc = tempLocation()
      println(loc)
      alignments.rdd.saveAsParquet(TestSaveArgs(loc), alignments.sequences, alignments.recordGroups)
      println("end")
    } finally {
      sc.stop()
    }
  }
}

/**
 * Concrete [[ADAMSaveAnyArgs]] carrying the settings handed to `saveAsParquet`.
 *
 * All members are mutable `var`s, as in the original definition.
 *
 * @param outputPath location the Parquet output is written to
 */
case class TestSaveArgs(var outputPath: String) extends ADAMSaveAnyArgs {
  // Parquet sizing: 128 * 1024 * 1024 and 1024 * 1024 (presumably bytes — confirm against ADAM).
  var blockSize: Int = 128 * 1024 * 1024
  var pageSize: Int = 1024 * 1024
  var compressionCodec: CompressionCodecName = CompressionCodecName.GZIP
  var disableDictionaryEncoding: Boolean = false

  // Output-shape switches, both off.
  var sortFastqOutput: Boolean = false
  var asSingleFile: Boolean = false

  var logLevel: String = "SEVERE"
}


pom配置:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion>

<groupId>org.gcdss</groupId>
<artifactId>GCDSS</artifactId>
<version>1.0-SNAPSHOT</version>

<properties>
<!-- Version properties. Within this pom only adam.version is referenced (by the ADAM dependencies); the remaining properties are declared but not used in this file. -->
<!-- adam.version is a SNAPSHOT, so it has to be resolved from a snapshot repository rather than a release repository. -->
<adam.version>0.19.1-SNAPSHOT</adam.version>
<scala.version>2.10.4</scala.version>
<scala.version.prefix>2.10</scala.version.prefix>
<scalatra.version>2.4.+</scalatra.version>
<spark.version>1.5.2</spark.version>
<parquet.version>1.8.1</parquet.version>
</properties>

<!-- Repositories searched for artifacts: Maven Central, the Sonatype OSS snapshots repository, an Apache snapshot repository, and the Spark Packages repository. -->
<!-- NOTE(review): all URLs use plain http. Maven Central has required https since January 2020, and the Bintray-hosted repository may no longer be available; confirm the URLs before reusing this pom. -->
<repositories>
<repository>
<id>central</id>
<url>http://repo1.maven.org/maven2/</url>
</repository>
<repository>
<id>Sonatype</id>
<url>http://oss.sonatype.org/content/repositories/snapshots/</url>
</repository>
<repository>
<id>Apache</id>
<url>http://people.apache.org/repo/m2-snapshot-repository</url>
</repository>
<repository>
<id>SparkPackagesRepo</id>
<url>http://dl.bintray.com/spark-packages/maven</url>
</repository>
</repositories>

<!-- Build configuration: binds the scalatest-maven-plugin "test" goal and writes its reports under the build directory's surefire-reports folder. -->
<!-- NOTE(review): no plugin version is declared here, so the resolved version depends on the Maven setup; consider pinning one. -->
<build>
<plugins>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<configuration>
<reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
<junitxml>.</junitxml>
<filereports>ADAMTestSuite.txt</filereports>
<!--
As explained here: http://stackoverflow.com/questions/1660441/java-flag-to-enable-extended-serialization-debugging-info The second option allows us better debugging for serialization-based errors.
-->
<argLine>-Xmx1024m -Dsun.io.serialization.extendedDebugInfo=true</argLine>
</configuration>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

<!-- Dependencies: adam-core (main jar, plus its test-jar in test scope) and adam-cli, all built for Scala 2.10 and all at ${adam.version}. -->
<!-- The commented-out entries (utils-misc test-jar, bdg-formats, scalatest) are disabled and carry no version. -->
<dependencies>
<!--<dependency>-->
<!--<groupId>org.bdgenomics.utils</groupId>-->
<!--<artifactId>utils-misc_2.10</artifactId>-->
<!--<type>test-jar</type>-->
<!--<scope>test</scope>-->
<!--</dependency>-->
<!--<dependency>-->
<!--<groupId>org.bdgenomics.bdg-formats</groupId>-->
<!--<artifactId>bdg-formats</artifactId>-->
<!--</dependency>-->

<dependency>
<groupId>org.bdgenomics.adam</groupId>
<artifactId>adam-core_2.10</artifactId>
<version>${adam.version}</version>
</dependency>
<dependency>
<groupId>org.bdgenomics.adam</groupId>
<artifactId>adam-core_2.10</artifactId>
<version>${adam.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.bdgenomics.adam</groupId>
<artifactId>adam-cli_2.10</artifactId>
<version>${adam.version}</version>
</dependency>
<!--<dependency>-->
<!--<groupId>org.scalatest</groupId>-->
<!--<artifactId>scalatest_2.10</artifactId>-->
<!--<scope>test</scope>-->
<!--</dependency>-->
</dependencies>

</project>


结果:

hadoop@Master:~/xubo/data/mango$ unzip ADAMContextSuite5309214740546591997.adam.zip
Archive:  ADAMContextSuite5309214740546591997.adam.zip
creating: ADAMContextSuite5309214740546591997.adam/
extracting: ADAMContextSuite5309214740546591997.adam/._SUCCESS.crc
extracting: ADAMContextSuite5309214740546591997.adam/._common_metadata.crc
extracting: ADAMContextSuite5309214740546591997.adam/._metadata.crc
extracting: ADAMContextSuite5309214740546591997.adam/._rgdict.avro.crc
extracting: ADAMContextSuite5309214740546591997.adam/._seqdict.avro.crc
extracting: ADAMContextSuite5309214740546591997.adam/.part-r-00000.gz.parquet.crc
extracting: ADAMContextSuite5309214740546591997.adam/_SUCCESS
inflating: ADAMContextSuite5309214740546591997.adam/_common_metadata
inflating: ADAMContextSuite5309214740546591997.adam/_metadata
inflating: ADAMContextSuite5309214740546591997.adam/_rgdict.avro
inflating: ADAMContextSuite5309214740546591997.adam/_seqdict.avro
inflating: ADAMContextSuite5309214740546591997.adam/part-r-00000.gz.parquet


运行环境为 IntelliJ IDEA

参考:

【1】 http://blog.csdn.net/xubo245/article/details/51537256

【2】https://github.com/bigdatagenomics/adam/blob/master/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala

【3】https://github.com/bigdatagenomics/adam/blob/master/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  mango