data-model-generator

Data model generator based on Scala case classes.

Build Status codecov.io License

Table of contents

Goals

  • Generate data model (e.g. DDL, avro schema, Elasticsearch mapping) based on Scala case classes

Getting started

Include dependency:

"com.github.piotr-kalanski" % "data-model-generator_2.11" % "0.7.7"

or

<dependency>
    <groupId>com.github.piotr-kalanski</groupId>
    <artifactId>data-model-generator_2.11</artifactId>
    <version>0.7.7</version>
</dependency>

Dialects

H2 dialect

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

object H2Example extends App {
  println(DataModelGenerator.generate[Book](dialects.H2Dialect))
}
CREATE TABLE Book(
   title VARCHAR,
   year INT,
   owner OTHER,
   authors ARRAY
);

Hive dialect

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

object HiveExample extends App {
  println(DataModelGenerator.generate[Book](dialects.HiveDialect))
}
CREATE TABLE Book(
   title STRING,
   year INT,
   owner STRUCT<name : STRING, age : INT>,
   authors ARRAY<STRUCT<name : STRING, age : INT>>
);

Redshift dialect

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

object RedshiftExample extends App {
  println(DataModelGenerator.generate[Book](dialects.RedshiftDialect))
}
CREATE TABLE Book(
   title VARCHAR,
   year INTEGER,
   owner VARCHAR,
   authors VARCHAR
);

MySQL dialect

import com.datawizards.dmg.{DataModelGenerator, dialects}

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

object MySQLExample extends App {
  println(DataModelGenerator.generate[Book](dialects.MySQLDialect))
}
CREATE TABLE Book(
   title VARCHAR,
   year INTEGER,
   owner JSON,
   authors JSON
);

Avro schema dialect

Avro schema

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

DataModelGenerator.generate[Book](dialects.AvroSchemaDialect)
{
   "namespace": "com.datawizards.dmg.examples",
   "type": "record",
   "name": "Book",
   "fields": [
      {"name": "title", "type": "string"},
      {"name": "year", "type": "int"},
      {"name": "owner", "type": "record", "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}]},
      {"name": "authors", "type": "array", "items": {"type": "record", "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}]}}
   ]
}

Avro schema for Avro Schema Registry

case class Person(name: String, age: Int, skills: Seq[String])

DataModelGenerator.generate[Person](dialects.AvroSchemaRegistryDialect)
{"schema":
"{
   \"namespace\": \"com.datawizards.dmg.examples\",
   \"type\": \"record\",
   \"name\": \"Person\",
   \"fields\": [
      {\"name\": \"name\", \"type\": \"string\"},
      {\"name\": \"age\", \"type\": \"int\"},
      {\"name\": \"skills\", \"type\": \"array\", \"items\": \"string\"}
   ]
}"
}

Elasticsearch dialect

case class Person(name: String, age: Int)
case class Book(title: String, year: Int, owner: Person, authors: Seq[Person])

DataModelGenerator.generate[Book](dialects.ElasticsearchDialect)
{
   "mappings" : {
      "Book" : {
         "properties" : {
            "title" : {"type" : "string"},
            "year" : {"type" : "integer"},
            "owner" : {
               "properties" : {
                  "name" : {"type" : "string"},
                  "age" : {"type" : "integer"}
               }
            },
            "authors" : {
               "properties" : {
                  "name" : {"type" : "string"},
                  "age" : {"type" : "integer"}
               }
            }
         }
      }
   }
}

Java dialect

case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](dialects.Java)
public class Person {
   private String name;
   private Integer age;

   public Person() {}

   public Person(String name, Integer age) {
      this.name = name;
      this.age = age;
   }

   public String getName() {
      return name;
   }

   public void setName(String name) {
      this.name = name;
   }

   public Integer getAge() {
      return age;
   }

   public void setAge(Integer age) {
      this.age = age;
   }
}

Installers

Library enables installing generated data model at target data store e.g. registering generated avro schema at Avro Schema Registry, creating Elasticsearch index or creating Hive table.

Register Avro schema to Avro schema registry

import com.datawizards.dmg.service.AvroSchemaRegistryServiceImpl

case class Person(name: String, age: Int)

object RegisterAvroSchema extends App {
  val service = new AvroSchemaRegistryServiceImpl("http://localhost:8081")
  service.registerSchema[Person]("person")

  println("Subjects:")
  println(service.subjects())

  println("Registered schema:")
  println(service.fetchSchema("person"))
}
"Subjects:"
["person"]
"Registered schema:"
{"type":"record","name":"Person","namespace":"com.datawizards.dmg.examples","fields":[{"name":"name","type":"string"},{"name":"age","type":"int"}]}

Copy Avro schema to HDFS

import com.datawizards.dmg.service.AvroSchemaRegistryServiceImpl

case class Person(name: String, age: Int)

object CopyAvroSchemaToHDFS extends App {
  val service = new AvroSchemaRegistryServiceImpl("http://localhost:8081")
  service.copyAvroSchemaToHdfs[Person]("/metadata/schemas/person")
}

Create Elasticsearch index

import com.datawizards.dmg.service.ElasticsearchServiceImpl

case class Person(name: String, age: Int)

object CreateElasticsearchIndex extends App {
  val service = new ElasticsearchServiceImpl("http://localhost:9200")
  service.createIndex[Person]("person")

  println("Index:")
  println(service.getIndexSettings("person"))
}

Create Elasticsearch template

import com.datawizards.dmg.examples.TestModel.PersonWithMultipleEsAnnotations
import com.datawizards.dmg.service.ElasticsearchServiceImpl

object CreateElasticsearchTemplate extends App {
  val service = new ElasticsearchServiceImpl("http://localhost:9200")
  service.updateTemplate[PersonWithMultipleEsAnnotations]("people")

  println("Template:")
  println(service.getTemplate("people"))
}

Create Hive table

import com.datawizards.dmg.service.HiveServiceImpl

HiveServiceImpl.createHiveTable[Person]()

Extracting class metadata

To extract class metadata you can use method MetaDataWithDialectExtractor.extractClassMetaDataForDialect. Example:

MetaDataWithDialectExtractor.extractClassMetaDataForDialect[T](Some(dialects.HiveDialect))

Customizations

Custom column name

import com.datawizards.dmg.annotations._

case class Person(
  @column(name="personName")
  name: String,
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)
CREATE TABLE Person(
   personName VARCHAR,
   age INT
);

Custom column name specific for dialect

import com.datawizards.dmg.annotations._

case class Person(
  @column(name="NAME")
  @column(name="personName", dialects.ElasticsearchDialect)
  name: String,
  @column(name="AGE")
  @column(name="personAge", dialects.ElasticsearchDialect)
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
CREATE TABLE PEOPLE(
   NAME VARCHAR,
   AGE INT
);
{
   "mappings" : {
      "person" : {
         "personName" : {"type" : "string"},
         "personAge" : {"type" : "integer"}
      }
   }
}

Custom table name

import com.datawizards.dmg.annotations._

@table("PEOPLE")
case class Person(
  name: String,
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)
CREATE TABLE PEOPLE(
   name VARCHAR,
   age INT
);

Custom table name specific for dialect

import com.datawizards.dmg.annotations._

@table("PEOPLE")
@table("person", dialects.ElasticsearchDialect)
case class Person(
  name: String,
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)
DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
CREATE TABLE PEOPLE(
   name VARCHAR,
   age INT
);
{
   "mappings" : {
      "person" : {
         "name" : {"type" : "string"},
         "age" : {"type" : "integer"}
      }
   }
}

Placeholders

data-model-generator supports placeholder variables when generating data model. Placeholder variables can be used in any annotation.

Example use case for placeholder variables is to use them for generating table name dependent on environment. For example, each environment has dedicated DB schema e.g. development, uat, production.

@table("${environment}.people")
case class Person(
    name: String,
    age: Int
)

TemplateHandler.inflate(DataModelGenerator.generate[Person](H2Dialect), Map("environment" -> "development"))
CREATE TABLE development.people(
   name VARCHAR,
   age INT
);
TemplateHandler.inflate(DataModelGenerator.generate[Person](H2Dialect), Map("environment" -> "production"))
CREATE TABLE production.people(
   name VARCHAR,
   age INT
);

Documentation comments

@comment("People data")
case class PersonWithComments(
    @comment("Person name") name: String,
    age: Int
)

H2

DataModelGenerator.generate[PersonWithComments](dialects.H2Dialect)
CREATE TABLE PersonWithComments(
   name VARCHAR COMMENT 'Person name',
   age INT
);
COMMENT ON TABLE PersonWithComments IS 'People data';

Hive

DataModelGenerator.generate[PersonWithComments](dialects.HiveDialect)
CREATE TABLE PersonWithComments(
   name STRING COMMENT 'Person name',
   age INT
)
COMMENT 'People data';

Redshift

DataModelGenerator.generate[PersonWithComments](dialects.RedshiftDialect)
CREATE TABLE PersonWithComments(
   name VARCHAR,
   age INTEGER
);
COMMENT ON TABLE PersonWithComments IS 'People data';
COMMENT ON COLUMN PersonWithComments.name IS 'Person name';

Avro schema

DataModelGenerator.generate[PersonWithComments](dialects.AvroSchemaDialect)
{
   "namespace": "com.datawizards.dmg.examples",
   "type": "record",
   "name": "PersonWithComments",
   "doc": "People data",
   "fields": [
      {"name": "name", "type": "string", "doc": "Person name"},
      {"name": "age", "type": "int"}
   ]
}

Column length

import com.datawizards.dmg.annotations._

case class Person(
  @length(1000) name: String,
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)
CREATE TABLE PEOPLE(
   name VARCHAR(1000),
   age INT
);

Not null

import com.datawizards.dmg.annotations._

case class Person(
  @notNull name: String,
  age: Int
)

DataModelGenerator.generate[Person](dialects.H2Dialect)
DataModelGenerator.generate[Person](dialects.RedshiftDialect)
DataModelGenerator.generate[Person](dialects.AvroSchemaDialect)

H2 - not null

CREATE TABLE PersonWithNull(
   name VARCHAR NOT NULL,
   age INT
);

Redshift - not null

CREATE TABLE PersonWithNull(
   name VARCHAR NOT NULL,
   age INTEGER
);

Avro schema - not null

{
   "namespace": "com.datawizards.dmg",
   "type": "record",
   "name": "PersonWithNull",
   "fields": [
      {"name": "name", "type": "string"},
      {"name": "age", "type": ["null", "int"]}
   ]
}

Underscore

Convert table and column names for selected dialect to underscore convention.

@underscore(dialect=dialects.H2Dialect)
case class PersonWithUnderscore(
    personName: String,
    personAge: Int
)
CREATE TABLE person_with_underscore(
   person_name VARCHAR,
   person_age INT
);

Hive customizations

Hive external table

@hiveExternalTable(location="hdfs:///data/people")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE EXTERNAL TABLE Person(
   name STRING,
   age INT
)
LOCATION 'hdfs:///data/people';

Hive ROW FORMAT SERDE

@hiveRowFormatSerde(format="org.apache.hadoop.hive.serde2.avro.AvroSerDe")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE TABLE Person(
   name STRING,
   age INT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe';

Hive STORED AS

@hiveStoredAs(format="PARQUET")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE TABLE Person(
   name STRING,
   age INT
)
STORED AS PARQUET;

Hive TABLE PROPERTIES

@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](dialects.HiveDialect)
CREATE TABLE Person(
   name STRING,
   age INT
)
TBLPROPERTIES(
   'key1' = 'value1',
   'key2' = 'value2',
   'key3' = 'value3'
);

Hive avro schema url property

@hiveTableProperty("avro.schema.url", "hdfs:///metadata/person.avro")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](dialects.HiveDialect)

If "avro.schema.url" table property is provided then generated data model doesn't have any columns definitions, because they are taken by Hive from avro schema.

CREATE TABLE Person
TBLPROPERTIES(
   'avro.schema.url' = 'hdfs:///metadata/person.avro'
);

Hive partition columns

case class ClicksPartitioned(
    time: Timestamp,
    event: String,
    user: String,
    @hivePartitionColumn
    year: Int,
    @hivePartitionColumn
    month: Int,
    @hivePartitionColumn
    day: Int
)

DataModelGenerator.generate[ClicksPartitioned](dialects.HiveDialect)
CREATE TABLE ClicksPartitioned(
   time TIMESTAMP,
   event STRING,
   user STRING
)
PARTITIONED BY(year INT, month INT, day INT);

Hive partition columns - order

case class ClicksPartitioned(
    time: Timestamp,
    event: String,
    user: String,
    @hivePartitionColumn(order=3)
    day: Int,
    @hivePartitionColumn(order=1)
    year: Int,
    @hivePartitionColumn(order=2)
    month: Int
)

DataModelGenerator.generate[ClicksPartitionedWithOrder](dialects.HiveDialect)
CREATE TABLE ClicksPartitionedWithOrder(
   time TIMESTAMP,
   event STRING,
   user STRING
)
PARTITIONED BY(year INT, month INT, day INT);

Hive Parquet table with many annotations

@table("CUSTOM_TABLE_NAME")
@comment("Table comment")
@hiveStoredAs(format="PARQUET")
@hiveExternalTable(location="hdfs:///data/table")
@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class ParquetTableWithManyAnnotations(
    @column("eventTime")
    @comment("Event time")
    time: Timestamp,
    @comment("Event name")
    event: String,
    @comment("User id")
    user: String,
    @hivePartitionColumn(order=3)
    day: Int,
    @hivePartitionColumn(order=1)
    year: Int,
    @hivePartitionColumn(order=2)
    month: Int
)

DataModelGenerator.generate[ParquetTableWithManyAnnotations](dialects.HiveDialect)
CREATE EXTERNAL TABLE CUSTOM_TABLE_NAME(
   eventTime TIMESTAMP COMMENT 'Event time',
   event STRING COMMENT 'Event name',
   user STRING COMMENT 'User id'
)
COMMENT 'Table comment'
PARTITIONED BY(year INT, month INT, day INT)
STORED AS PARQUET
LOCATION 'hdfs:///data/table'
TBLPROPERTIES(
   'key1' = 'value1',
   'key2' = 'value2',
   'key3' = 'value3'
);

Hive Avro table with many annotations

@table("CUSTOM_TABLE_NAME")
@comment("Table comment")
@hiveRowFormatSerde(format="org.apache.hadoop.hive.serde2.avro.AvroSerDe")
@hiveStoredAs("INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'")
@hiveExternalTable(location="hdfs:///data/table")
@hiveTableProperty("avro.schema.url", "hdfs:///metadata/table.avro")
@hiveTableProperty("key1", "value1")
@hiveTableProperty("key2", "value2")
@hiveTableProperty("key3", "value3")
case class AvroTableWithManyAnnotations(
    @column("eventTime")
    @comment("Event time")
    time: Timestamp,
    @comment("Event name")
    event: String,
    @comment("User id")
    user: String,
    @hivePartitionColumn(order=3)
    day: Int,
    @hivePartitionColumn(order=1)
    year: Int,
    @hivePartitionColumn(order=2)
    month: Int
)

DataModelGenerator.generate[AvroTableWithManyAnnotations](dialects.HiveDialect)
CREATE EXTERNAL TABLE CUSTOM_TABLE_NAME
COMMENT 'Table comment'
PARTITIONED BY(year INT, month INT, day INT)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION 'hdfs:///data/table'
TBLPROPERTIES(
   'avro.schema.url' = 'hdfs:///metadata/table.avro',
   'key1' = 'value1',
   'key2' = 'value2',
   'key3' = 'value3'
);

Elasticsearch customizations

index settings

@esSetting("number_of_shards", 1)
@esSetting("number_of_replicas", 3)
@esSetting("blocks.read_only", true)
@esSetting("codec", "best_compression")
case class Person(name: String, age: Int)

DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
   "settings" : {
      "number_of_shards" : 1,
      "number_of_replicas" : 3,
      "blocks.read_only" : "true",
      "codec" : "best_compression"
   },
   "mappings" : {
      "Person" : {
         "properties" : {
            "name" : {"type" : "string"},
            "age" : {"type" : "integer"}
         }
      }
   }
}

index parameter

Index parameter: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-index.html

case class Person(
    @esIndex("not_analyzed") name: String,
    age: Int
)

DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
   "mappings" : {
      "PersonEsIndexSettings" : {
         "properties" : {
            "name" : {"type" : "string", "index" : "not_analyzed"},
            "age" : {"type" : "integer"}
         }
      }
   }
}

format parameter

Date format parameter: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html

case class Person(
    name: String,
    @esFormat("yyyy-MM-dd") birthday: Date
)

DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
   "mappings" : {
      "Person" : {
         "properties" : {
            "name" : {"type" : "string"},
            "birthday" : {"type" : "date", "format" : "yyyy-MM-dd"}
         }
      }
   }
}

Elasticsearch template

https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-templates.html

@esTemplate("people*")
case class PersonWithEsTemplate(name: String, age: Int)

DataModelGenerator.generate[Person](dialects.ElasticsearchDialect)
{
   "template" : "people*",
   "mappings" : {
      "PersonWithEsTemplate" : {
         "properties" : {
            "name" : {"type" : "string"},
            "age" : {"type" : "integer"}
         }
      }
   }
}

Elasticsearch multiple annotations

@table("people")
@esTemplate("people*")
@esSetting("number_of_shards", 1)
@esSetting("number_of_replicas", 3)
case class PersonWithMultipleEsAnnotations(
    @esIndex("not_analyzed")
    @column("personName")
    name: String,
    @column("personBirthday")
    @esFormat("yyyy-MM-dd")
    birthday: Date
)

DataModelGenerator.generate[PersonWithMultipleEsAnnotations](dialects.ElasticsearchDialect)
{
   "template" : "people*",
   "settings" : {
      "number_of_shards" : 1,
      "number_of_replicas" : 3
   },
   "mappings" : {
      "people" : {
         "properties" : {
            "personName" : {"type" : "string", "index" : "not_analyzed"},
            "personBirthday" : {"type" : "date", "format" : "yyyy-MM-dd"}
         }
      }
   }
}