heuermh / adam-gfa   0.8.0

GNU Lesser General Public License v3.0 only GitHub

Graphical Fragment Assembly (GFA) support for ADAM. Licensed LGPL version 3 or later.

Scala versions: 2.12 2.11

adam-gfa

Graphical Fragment Assembly (GFA) support for ADAM.

Build Status Maven Central API Documentation

Hacking adam-gfa

Install

To build

$ mvn install

Running adam-gfa

Transform GFA 1.0 to generic Gfa1Record records in Parquet format

$ spark-submit \
    --class com.github.heuermh.adam.gfa.Gfa1ToDataframe \
    target/adam-gfa_2.12-${version}.jar \
    in.gfa \
    out.parquet

Transform GFA 1.0 to specific Containment, Link, Path, Segment, and Traversal records in Parquet format

$ spark-submit \
    --class com.github.heuermh.adam.gfa.Gfa1ToDataframes \
    target/adam-gfa_2.12-${version}.jar \
    in.gfa \
    out

(creates separate out.containments.parquet, out.links.parquet, out.paths.parquet, out.segments.parquet, and out.traversals.parquet directories)

Graphical Fragment Assembly (GFA) version 1.0 schema in Parquet format

Gfa1Record

Gfa1Record (scaladoc)

message spark_schema {
  optional binary recordType (STRING);
  optional binary name (STRING);
  optional binary sequence (STRING);
  optional int32 length;
  optional int32 readCount;
  optional int32 fragmentCount;
  optional int32 kmerCount;
  optional binary sequenceChecksum (STRING);
  optional binary sequenceUri (STRING);
  optional binary stableName (STRING);
  optional int32 stableOffset;
  optional int32 stableRank;
  optional binary id (STRING);
  optional group source {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional group target {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional binary overlap (STRING);
  optional int32 mappingQuality;
  optional int32 mismatchCount;
  optional binary pathName (STRING);
  optional group segments (LIST) {
    repeated group list {
      optional group element {
        optional binary id (STRING);
        optional binary orientation (STRING);
      }
    }
  }
  optional group overlaps (LIST) {
    repeated group list {
      optional binary element (STRING);
    }
  }
  optional int32 ordinal;
  optional group container {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional group contained {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional int32 position;
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

Containment

Containment (scaladoc)

message spark_schema {
  optional binary id (STRING);
  optional group container {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional group contained {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional int32 position;
  optional binary overlap (STRING);
  optional int32 mismatchCount;
  optional int32 readCount;
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

Link

Link (scaladoc)

message spark_schema {
  optional binary id (STRING);
  optional group source {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional group target {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional binary overlap (STRING);
  optional int32 mappingQuality;
  optional int32 mismatchCount;
  optional int32 readCount;
  optional int32 fragmentCount;
  optional int32 kmerCount;
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

Path

Path (scaladoc)

message spark_schema {
  optional binary pathName (STRING);
  optional group segments (LIST) {
    repeated group list {
      optional group element {
        optional binary id (STRING);
        optional binary orientation (STRING);
      }
    }
  }
  optional group overlaps (LIST) {
    repeated group list {
      optional binary element (STRING);
    }
  }
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

Segment

Segment (scaladoc)

message spark_schema {
  optional binary name (STRING);
  optional binary sequence (STRING);
  optional int32 length;
  optional int32 readCount;
  optional int32 fragmentCount;
  optional int32 kmerCount;
  optional binary sequenceChecksum (STRING);
  optional binary sequenceUri (STRING);
  optional binary stableName (STRING);
  optional int32 stableOffset;
  optional int32 stableRank;
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}

Traversal

Traversal (scaladoc)

message spark_schema {
  optional binary id (STRING);
  optional binary pathName (STRING);
  optional int32 ordinal;
  optional group source {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional group target {
    optional binary id (STRING);
    optional binary orientation (STRING);
  }
  optional binary overlap (STRING);
  optional group annotations (MAP) {
    repeated group key_value {
      required binary key (STRING);
      optional group value {
        optional binary name (STRING);
        optional binary type (STRING);
        optional binary value (STRING);
      }
    }
  }
}