warehouse/ingest-configuration/src/main/resources/config/myjson-ingest-config.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!--

   Configuration for ingesting arbitrary Json. For demonstration purposes, this config uses
   the following example data file as the basis for some of the "known field" settings below:

   warehouse/ingest-json/src/test/resources/input/tvmaze-api.json

    - File was populated via "http://api.tvmaze.com/singlesearch/shows?q={SHOW_NAME}&embed=cast"
      using this script:

          datawave-quickstart/bin/services/datawave/ingest-examples/ingest-tv-shows.sh

    - Includes shows: Orange is the New Black, The Middle, Everybody Loves Raymond, The Honeymooners, Westworld,
      The Walking Dead, The Big Bang Theory, Star Trek, T.J. Hooker, American Pickers, Pawn Stars, Seinfeld,
      The Following, and Tremors

   However, the JsonInputFormat, JsonRecordReader, and JsonIngestHelper classes make no
   assumptions about the Json schemas to be parsed. They can be used to ingest a wide
   variety of Json representations with little or no additional customization.

   E.g., see 'myjson.data.json.flattener.mode' property below for details on supported
   parsing behaviors. Also, classes and configs may be extended as needed to handle more
   specialized parsing/extraction.

   Also, the script datawave-quickstart/bin/services/datawave/ingest-examples/ingest-tv-shows.sh will allow you to
   query api.tvmaze.com for TV shows of interest and ingest the resulting JSON data on demand.

-->

<configuration>

    <!-- Input format class used to read the raw json input. Per the header comment of this file,
         JsonInputFormat makes no assumptions about the json schema being parsed. Note that, unlike
         the properties that follow, this key is not prefixed with the 'myjson' data name. -->
    <property>
        <name>file.input.format</name>
        <value>datawave.ingest.json.mr.input.JsonInputFormat</value>
    </property>

    <property>
        <name>data.name</name>
        <value>myjson</value>
        <description>This is the name of the datatype, which distinguishes it from other types for the purposes of ingest
            processing and perhaps even for dataflow/transport concerns. As such, this can be used to denote a subtype of
            some common data format, like CSV files for example, which could originate from any number of sources
        </description>
    </property>

    <property>
        <name>myjson.output.name</name>
        <value>tvmaze</value>
        <description>This is the name to use on the data in Accumulo</description>
    </property>

    <!-- Ingest helper class for the 'myjson' datatype. JsonIngestHelper is one of the two classes
         that support the flattening modes described under myjson.data.json.flattener.mode. -->
    <property>
        <name>myjson.ingest.helper.class</name>
        <value>datawave.ingest.json.config.helper.JsonIngestHelper</value>
    </property>

    <!-- Record reader class for the 'myjson' datatype. JsonRecordReader is one of the two classes
         that support the flattening modes described under myjson.data.json.flattener.mode. -->
    <property>
        <name>myjson.reader.class</name>
        <value>datawave.ingest.json.mr.input.JsonRecordReader</value>
    </property>

    <property>
        <name>myjson.handler.classes</name>
        <value>datawave.ingest.json.mr.handler.ContentJsonColumnBasedHandler</value>
        <description>List of classes that should process each event</description>
    </property>

    <property>
        <name>myjson.data.category.uuid.fields</name>
        <value>ID,EXTERNALS_THETVDB,EXTERNALS_TVRAGE,EXTERNALS_IMDB</value>
        <description>List of known fields that contain UUIDs</description>
    </property>

    <property>
        <name>myjson.data.separator</name>
        <value>,</value>
        <description>This is the separator to use for delimited text, and between configuration file parameters with multiple values.
        </description>
    </property>

    <property>
        <name>myjson.data.header</name>
        <value>ID,NAME,PREMIERED,RUNTIME,STATUS,SUMMARY,OFFICIALSITE,LANGUAGE,GENRES,WEIGHT,URL,TYPE</value>
        <description>Known metadata fields that may be expected to appear in every json document. Often, these may be
            "required" fields, and/or fields that you want to use for policy enforcement, quality assurance, etc</description>
    </property>

    <property>
        <name>myjson.data.process.extra.fields</name>
        <value>true</value>
        <description>If true, "extra" fields within the json tree (ie, those outside the defined "header") should be
            processed. Otherwise, everything outside the header will be ignored unless explicitly whitelisted</description>
    </property>

    <property>
        <name>myjson.data.json.flattener.mode</name>
        <value>GROUPED_AND_NORMAL</value>
        <description>The classes datawave.ingest.json.mr.input.JsonRecordReader and
            datawave.ingest.json.config.helper.JsonIngestHelper support 4 different json-flattening modes:

        (1) 'SIMPLE' mode:

               Ignores nested objects. Only retrieves root-level primitives, including primitives within root-level arrays.
               Array primitives will be represented as multi-valued keys in the resulting map. If your json is flat already,
               or if you wish to ignore nested objects for whatever reason, this mode should suffice.  If you require
               complete tree traversal, see NORMAL and GROUPED modes.

               PROs:

               A good choice if all of your incoming json is flat. Least amount of overhead in terms of json parsing

               CONs:

               A bad choice if you need to ingest json schemas where items of interest are nested

        (2) 'NORMAL' mode:

               Traverses the entire Json tree, unlike SIMPLE mode. Given a nested 'fieldname' property @ level 4 in the
               tree and path delimiter value of '_', the flattened result would take the form:

               GREATGRANDPARENT_GRANDPARENT_PARENT_FIELDNAME = Value

               At query time, the full identifier is used as the search term

               PROs:

               The field names users will employ in queries match the structure of the underlying raw data, so
               DataWave's data element dictionary tends to be self-explanatory and intuitive for end users

               CONs:

               Can lead to very long field names, especially for deeply nested json schemas. Also, if users will
               need the ability to easily disambiguate json elements at query time based on their original position within
               the json tree, then 'NORMAL' mode will be problematic, since no context information is retained

        (3) 'GROUPED' mode:

               Full parse as with NORMAL mode, but instead we append the hierarchical context onto 'FIELDNAME' as a
               dot-delimited suffix, with additional information to identify the ordinal position or occurrence at each
               level...

               E.g., FIELDNAME.greatgrandparent_0.grandparent_1.parent_3.fieldname_0 = Value

               ...where '_#' denotes the specific occurrence of the element within the given level of the hierarchy.

               At query time, only the 'FIELDNAME' identifier is used as the search term. If FIELDNAME happens to be
               a repeated field within the JSON document and users will need the ability to disambiguate them at query
               time, DataWave provides filtering functions to support that...

               See datawave-quickstart/bin/services/datawave/test-web/tests/EventQueryJsonGrouped.test for examples

               PROs:

               Allows users to query with the shortest, ie most compact, field names possible. Also tends to
               keep the overall size in bytes of the data dictionary and forward/reverse indices to a minimum,
               since only the 'FIELDNAME' portion of the key is stored in those places; only the 'shard' table
               retains the full context which is used only for filtering purposes as needed

               CONs:

               Since the data dictionary only knows about the 'FIELDNAME' portion of the names, users may lose
               sight of the underlying structure and semantics of the raw data for query purposes. That may lead
               to confusion, particularly if the repository will store data from multiple distinct sources and those
               sources differ in format/structure yet have 'FIELDNAME' elements in common.

        (4) 'GROUPED_AND_NORMAL' mode:

                Retains the fully-qualified 'NORMAL' mode field name as the prefix, and appends the full context suffix
                as defined by 'GROUPED' mode above, for maximum flexibility

                E.g., GRANDPARENT_PARENT_FIELDNAME.grandparent_0.parent_2.fieldname_0

                PROs:

                As with 'NORMAL' mode, the data dictionary elements and index entries will reflect the structure of
                the incoming json, which may be more intuitive for users, alleviating confusion around field name
                semantics, etc. Also allows users to easily disambiguate values from different parts of the json's
                hierarchical structure, if needed

                CONs:

                Requires the most storage in bytes for fieldname storage in the shard table, since the path information
                is replicated in the suffix along with the ordinal context

        </description>
    </property>

    <!--<property>-->
    <!--<name>myjson.data.category.marking.visibility.field</name>-->
    <!--<value>DOCUMENT_VISIBILITY</value>-->
    <!--<description>Known field in every record that will contain the event's ColumnVisibility for Accumulo.-->
    <!--If the raw data doesn't convey security markings, then utilize the '.data.category.marking.default' property-->
    <!--instead, to declare the default marking to be applied to every field</description>-->
    <!--</property>-->

    <property>
        <name>myjson.data.category.marking.default</name>
        <value>PRIVATE|(BAR&amp;FOO)</value>
        <description>ColumnVisibility expression to be applied to each field, when the raw data is known to provide none</description>
    </property>

    <property>
        <name>myjson.SUMMARY.data.field.marking</name>
        <value>PUBLIC</value>
        <description>ColumnVisibility expression to be applied to the "SUMMARY" field</description>
    </property>

    <!--<property>-->
        <!--<name>myjson.data.category.date</name>-->
        <!--<value>PREMIERED</value>-->
        <!--<description>Known date field to be used, if found, for the shard row id. Otherwise, current date will be used</description>-->
    <!--</property>-->

    <property>
        <name>myjson.data.category.date.formats</name>
        <value>yyyy-MM-dd,yyyy-MM-dd'T'HH:mm:ss'Z',yyyy-MM-dd HH:mm:ss</value>
        <description>Known/valid date formats for *.data.category.date field</description>
    </property>

    <!-- Indexing and tokenization-->

    <!-- Fields to index. Entries are separated by the myjson.data.separator character (a comma);
         each field name should be listed exactly once. -->
    <property>
        <name>myjson.data.category.index</name>
        <value>NAME,ID,EXTERNALS_THETVDB,EXTERNALS_TVRAGE,EXTERNALS_IMDB,EMBEDDED_CAST_CHARACTER_NAME,EMBEDDED_CAST_PERSON_NAME,EMBEDDED_CAST_PERSON_ID,GENRES,NETWORK_NAME,OFFICIALSITE,TYPE,STATUS,RUNTIME,URL</value>
        <description>List of known fields to index</description>
    </property>

    <property>
        <name>myjson.data.category.index.reverse</name>
        <value>NAME,NETWORK_NAME,OFFICIALSITE,URL</value>
        <description>List of known fields to reverse index</description>
    </property>

    <property>
        <name>myjson.data.category.token.fieldname.designator</name>
        <value>_TOKEN</value>
        <description>Field name suffix to be applied to field names that are tokenized.
            See *.data.category.index.tokenize.whitelist</description>
    </property>

    <property>
        <name>myjson.data.category.index.tokenize.whitelist</name>
        <value>SUMMARY,NETWORK_NAME,NAME,EMBEDDED_CAST_CHARACTER_NAME,EMBEDDED_CAST_PERSON_NAME</value>
        <description>These are the fields to tokenize and index.</description>
    </property>

    <!-- Each entry below is a field from myjson.data.category.index.tokenize.whitelist with the
         myjson.data.category.token.fieldname.designator suffix (_TOKEN) appended. When a field is
         added to or removed from the whitelist, this list presumably needs the matching change. -->
    <property>
        <name>myjson.data.category.index.only</name>
        <value>SUMMARY_TOKEN,NETWORK_NAME_TOKEN,NAME_TOKEN,EMBEDDED_CAST_CHARACTER_NAME_TOKEN,EMBEDDED_CAST_PERSON_NAME_TOKEN</value>
        <description>Fields that will exist only in the global index. Will not be stored as part of the event/document</description>
    </property>

    <!-- Field Normalization -->

    <property>
        <name>myjson.data.default.normalization.failure.policy</name>
        <value>FAIL</value>
        <description>For field normalization failures: DROP, LEAVE, FAIL.
             FAIL: the entire event/document will be dropped, and possibly written to the error schema in Accumulo.
            LEAVE: the non-normalized value will be kept as-is.
             DROP: the failed field will be dropped, and everything else retained
        </description>
    </property>

    <property>
        <name>myjson.data.default.type.class</name>
        <value>datawave.data.type.LcNoDiacriticsType</value>
        <description>Default type</description>
    </property>

    <!-- Type class for the PREMIERED field. Presumably takes precedence over
         myjson.data.default.type.class for this one field (TODO confirm against the ingest helper). -->
    <property>
        <name>myjson.PREMIERED.data.field.type.class</name>
        <value>datawave.data.type.DateType</value>
    </property>

    <!-- Type class for the WEIGHT field. Presumably takes precedence over
         myjson.data.default.type.class for this one field (TODO confirm against the ingest helper). -->
    <property>
        <name>myjson.WEIGHT.data.field.type.class</name>
        <value>datawave.data.type.NumberType</value>
    </property>

    <!-- Type class for the RUNTIME field, which is also listed in myjson.data.category.index.
         Presumably takes precedence over myjson.data.default.type.class for this one field
         (TODO confirm against the ingest helper). -->
    <property>
        <name>myjson.RUNTIME.data.field.type.class</name>
        <value>datawave.data.type.NumberType</value>
    </property>

</configuration>