warehouse/ingest-configuration/src/main/resources/config/mycsv-ingest-config.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>

    <!-- Datatype identity. 'data.name' (mycsv) is the prefix carried by the other property names
         in this file; 'mycsv.output.name' (csv) is the name used for the data in Accumulo.
         'file.input.format' names the input format class (presumably the MapReduce InputFormat
         used to read the raw files; confirm against the ingest job). -->
    <property>
        <name>file.input.format</name>
        <value>datawave.ingest.csv.mr.input.CSVFileInputFormat</value>
    </property>

    <property>
        <name>data.name</name>
        <value>mycsv</value>
        <description>This is the name of the datatype, which distinguishes it from other types for the purposes of ingest
            processing and perhaps even for dataflow/transport concerns. As such, this can be used to denote a subtype of
            some common data format, like CSV files for example, which could originate from any number of sources
        </description>
    </property>

    <property>
        <name>mycsv.output.name</name>
        <value>csv</value>
        <description>This is the name to use on the data in Accumulo. If not specified, 'data.name' is used by default
        </description>
    </property>

    <!-- Classes wired in for the mycsv datatype: record-level policy enforcer, ingest helper,
         record reader, and the handler(s) that process each event. 'mycsv.event.validators'
         is left empty here, i.e. no validator classes are listed. -->
    <property>
        <name>mycsv.ingest.policy.enforcer.class</name>
        <value>datawave.policy.ExampleIngestPolicyEnforcer</value>
        <description>Name of the class to use for record-level policy enforcement.
            (1) datawave.policy.IngestPolicyEnforcer$NoOpIngestPolicyEnforcer will assume all records are valid.
            (2) datawave.policy.ExampleIngestPolicyEnforcer will perform some validations that you'd probably want
            to enforce in a production deployment
        </description>
    </property>

    <property>
        <name>mycsv.ingest.helper.class</name>
        <value>datawave.ingest.csv.config.helper.ExtendedCSVIngestHelper</value>
    </property>

    <property>
        <name>mycsv.reader.class</name>
        <value>datawave.ingest.csv.mr.input.CSVRecordReader</value>
    </property>

    <property>
        <name>mycsv.handler.classes</name>
        <value>datawave.ingest.csv.mr.handler.ContentCSVColumnBasedHandler</value>
        <description>List of classes that should process this event</description>
    </property>

    <property>
        <name>mycsv.event.validators</name>
        <value></value>
    </property>

    <!-- Error classification. The RawDataError names listed in 'mycsv.ingest.fatal.errors' are
         treated as fatal in addition to whatever is fatal by default. The helper class named in
         'mycsv.ingest.ignorable.error.helpers' is IgnorableFatalErrorHelper, which (per the
         description of 'mycsv.ingest.ignorable.fatal.errors' in this file) decides which fatal
         errors cause an event to be dropped rather than sent to the error* tables. -->
    <property>
        <name>mycsv.ingest.fatal.errors</name>
        <value>MISSING_DATA_ERROR,INVALID_DATA_ERROR</value>
        <description>This is the list of comma delimited additional RawDataError enumerations that we consider fatal.
               MISSING_DATA_ERROR: Missing required fields
               INVALID_DATA_ERROR: Found a value in a range of invalid values
        </description>
    </property>

    <property>
        <name>mycsv.ingest.ignorable.error.helpers</name>
        <value>datawave.ingest.data.config.ingest.IgnorableFatalErrorHelper</value>
    </property>

    <property>
        <name>mycsv.ingest.ignorable.fatal.errors</name>
<!-- Alternative value (disabled) that would also make MISSING_DATA_ERROR ignorable: -->
<!--    <value>MISSING_DATA_ERROR,INVALID_DATA_ERROR</value> -->
<!--
     Note: The sample file, ingest-csv/src/test/resources/input/my.csv, is designed to generate
     a 'MISSING_DATA_ERROR' in order to demonstrate population of the error* data tables. The 3rd record
     in that file is *missing* the SECURITY_MARKING field in the header, which is detected and flagged by
     the configured IngestPolicyEnforcer implementation. That is why MISSING_DATA_ERROR is left out of the
     active value below: listing it here would drop that record instead of redirecting it to the
     error* tables.
-->
        <value>INVALID_DATA_ERROR</value>
        <description>This is the list of comma delimited RawDataError values that result in dropping the event
            instead of redirecting to the error* tables. Used by the IgnorableFatalErrorHelper.
        </description>
    </property>

    <!-- Record layout: which fields hold UUIDs and the event id, the field delimiter, what to do
         when normalization fails, and the ordered list of header fields in the raw data.
         NOTE(review): UUID and PARENT_UUID are not part of 'mycsv.data.header'; presumably they
         arrive as extra name=value fields ('mycsv.data.process.extra.fields' is true in this
         file). Confirm against the sample data. -->
    <property>
        <name>mycsv.data.category.uuid.fields</name>
        <value>UUID,PARENT_UUID</value>
        <description>List of fields that contain UUIDs</description>
    </property>

    <property>
        <name>mycsv.data.separator</name>
        <value>,</value>
        <description>This is the separator to use for delimited text, and between configuration file parameters with multiple values.
        </description>
    </property>

    <property>
        <name>mycsv.data.default.normalization.failure.policy</name>
        <value>DROP</value>
        <description>For normalization failures: DROP, LEAVE, FAIL</description>
    </property>

    <property>
        <name>mycsv.data.category.id.field</name>
        <value>EVENT_ID</value>
        <description>Name of the field that contains the event id</description>
    </property>

    <property>
        <name>mycsv.data.header</name>
        <value>PROCESSING_DATE,EVENT_ID,LANGUAGE,ORIGINAL_SIZE,PROCESSED_SIZE,FILE_TYPE,MD5,SHA1,SHA256,EVENT_DATE,SECURITY_MARKING</value>
        <description>These are the fields in the raw data (delimited by separator).</description>
    </property>
    
<!--     <property> -->
<!--         <name>mycsv.data.category.marking.default</name> -->
<!--         <value>PRIVATE</value> -->
<!--         <description>Default ColumnVisibility to be applied to fields/records if none provided in the data</description> -->
<!--     </property> -->
        
    <!-- The two lists below are positional: entry N of '*.security.field.names' pairs with
         entry N of '*.security.field.domains'. Here the single field SECURITY_MARKING is
         paired with the 'columnVisibility' marking domain. Keep both lists the same length. -->
    <property>
        <name>mycsv.data.category.security.field.names</name>
        <value>SECURITY_MARKING</value>
        <description>These are the fields in the raw data that contain the event's security markings.
        Item N in this comma-delimited list must have a corresponding item N in the *.data.category.security.field.domains list
        </description>
    </property>
    
    <property>
        <name>mycsv.data.category.security.field.domains</name>
        <value>columnVisibility</value>
        <description>Marking domain N in this comma-delimited list corresponds to the security marking 
           field N configured in the *.data.category.security.field.names list 
        </description>
    </property>
    
    <!-- Fields that appear after the configured header are processed as name=value pairs.
         'mycsv.data.field.drop' is empty, so no field names are listed for dropping
         (presumably a separator-delimited list of field names; confirm before populating). -->
    <property>
        <name>mycsv.data.process.extra.fields</name>
        <value>true</value>
        <description>Are fields past the header to be processed as name=value pairs.</description>
    </property>

    <property>
        <name>mycsv.data.field.drop</name> 
        <value></value>
    </property>
 
	<!-- default is to treat all fields as multivalued, this is the blacklist -->
    <property>
        <name>mycsv.data.fields.multivalued.blacklist</name>
        <value>RAW_TEXT_BLOB</value>
    </property>

    <!-- Multivalued splitting and size limits. Multivalued fields are split on ';'. Each
         threshold has a companion '*.threshold.action' whose allowed values are listed in its
         description (DROP, TRUNCATE, REPLACE, or FAIL); both are set to TRUNCATE here.
         NOTE(review): the units of the two thresholds (value count vs. character length) are
         not stated in this file. Confirm against the ingest helper before changing them. -->
    <property>
        <name>mycsv.data.multivalued.separator</name>
        <value>;</value>
    </property>

    <property>
        <name>mycsv.data.multivalued.threshold</name>
        <value>100000</value>
    </property>

    <property>
        <name>mycsv.data.multivalued.threshold.action</name>
        <value>TRUNCATE</value>
        <description>DROP, TRUNCATE, REPLACE, or FAIL</description>
    </property>

    <property>
        <name>mycsv.data.field.length.threshold</name>
        <value>10000</value>
    </property>

    <property>
        <name>mycsv.data.threshold.action</name>
        <value>TRUNCATE</value>
        <description>DROP, TRUNCATE, REPLACE, or FAIL</description>
    </property>

    <!-- Field aliasing and index exclusions. The alias entry has the form A:B (FILETYPE:FILE_TYPE).
         The three blacklists below exclude fields from forward indexing, reverse indexing, and
         tokenized indexing respectively.
         NOTE(review): FIELDNAME1_WE_DONT_WANT_INDEXED and FIELDNAME1_NOT_TO_TOKENIZE do not appear
         in 'mycsv.data.header' and look like placeholders. Replace them with real field names
         or confirm they are intentional. -->
    <property>
        <name>mycsv.data.category.field.aliases</name>
        <value>FILETYPE:FILE_TYPE</value>
        <description>These are the fields aliases (delimited by first separator).
        </description>
    </property>

    <property>
        <name>mycsv.data.category.index.blacklist</name>
        <value>FIELDNAME1_WE_DONT_WANT_INDEXED</value>
        <description>These are the fields to *not* index</description>
    </property>

    <property>
        <name>mycsv.data.category.index.reverse.blacklist</name>
        <value>FIELDNAME1_WE_DONT_WANT_INDEXED,SECURITY_MARKING,EVENT_DATE,PROCESSING_DATE</value>
        <description>These are the fields to *not* reverse index. Contains the same fields from *.index.blacklist and date, time, numeric, or hash-based fields</description>
    </property>

    <property>
        <name>mycsv.data.category.index.tokenize.blacklist</name>
        <value>FIELDNAME1_WE_DONT_WANT_INDEXED,FIELDNAME1_NOT_TO_TOKENIZE,UUID,PARENT_UUID,SECURITY_MARKING,EVENT_DATE,PROCESSING_DATE,EVENT_ID</value>
        <description>These are the fields to *not* tokenize and index.</description>
    </property>
    
    <!-- Reverse-index counterpart of 'mycsv.data.category.index.tokenize.blacklist'; the two
         lists currently hold the same fields. -->
    <property>
        <name>mycsv.data.category.index.reverse.tokenize.blacklist</name>
        <value>FIELDNAME1_WE_DONT_WANT_INDEXED,FIELDNAME1_NOT_TO_TOKENIZE,UUID,PARENT_UUID,SECURITY_MARKING,EVENT_DATE,PROCESSING_DATE,EVENT_ID</value>
        <description>These are the fields to *not* tokenize and reverse index. Contains the same fields from *.index.blacklist and date, time, numeric, or hash-based fields</description>
    </property>
    
    <!-- Tokenization inputs and per-shard job counters. No stopword file is configured, so the
         default stop words apply (per the description). Token fields keep the name of their
         source field (designator disabled). Per-shard counters are off. -->
    <property>
        <name>mycsv.stopword.list.file</name>
        <value></value>
        <description>The stopword file to use. Default stop words will be used if none specified here</description>
    </property>
    
    <property>
        <name>mycsv.data.category.token.fieldname.designator.enabled</name>
        <value>false</value>
        <description>whether token fields should be named differently than their source fields</description>
    </property>

    <property>
        <name>mycsv.verbose.shard.counters</name>
        <value>false</value>
        <description>In the MapReduce ingest jobs, whether or not to increment
            counters for each "target" shard data will be loaded to. Warning:
            this is rather expensive and should not be 'true' for production
        </description>
    </property>

    <!-- NOTE: 'mycsv.verbose.term.size.counters' and 'mycsv.verbose.term.index.counters' used to be
         defined twice in this file: first as 'false' (with the descriptions kept below) and again
         further down as 'true' with empty descriptions. Hadoop's Configuration applies the last
         definition it reads from a resource, so 'true' was the effective value. The duplicates are
         merged here and that effective value is preserved so ingest behavior does not change.
         Set both to 'false' for production, as the descriptions advise. -->
    <property>
        <name>mycsv.verbose.term.size.counters</name>
        <value>true</value>
        <description>In the MapReduce ingest jobs, whether or not to increment
            counters for the various term sizes. Warning: this is rather
            expensive and should not be 'true' for production
        </description>
    </property>

    <property>
        <name>mycsv.verbose.term.index.counters</name>
        <value>true</value>
        <description>In the MapReduce ingest jobs, whether or not to increment
            counters for the various term types. Warning: this is rather
            expensive and should not be 'true' for production
        </description>
    </property>

    <!-- Tokenizer timing. The error and warn thresholds are in milliseconds. The two histogram
         lists below are positional: entry N of '*.time.thresholds.msec' is displayed using
         entry N of '*.time.threshold.names', so keep them the same length and order. -->
    <property>
        <name>mycsv.tokenizer.time.error.threshold.msec</name>
        <value>300000</value>
        <description>If a single document's tokenization exceeds this
            threshold, an exception will be thrown, causing the document to be
            placed in the processingErrors table (5*60*1000 = 300000 msec, 5
            minutes, which keeps it under the 10 minute hadoop job timeout
            threshold)
        </description>
    </property>

    <property>
        <name>mycsv.tokenizer.time.warn.threshold.msec</name>
        <value>150000</value>
        <description>If a single document's tokenization exceeds this threshold, a warning will be logged (2.5*60*1000 = 150000 msec, 2.5 minutes)</description>
    </property>

    <property>
        <name>mycsv.tokenizer.time.thresholds.msec</name>
        <value>1000,5000,10000,30000,60000,150000,300000</value>
        <description>time thresholds for tokenizer time histogram</description>
    </property>

    <property>
        <name>mycsv.tokenizer.time.threshold.names</name>
        <value>1s,5s,10s,30s,1m,2.5m,5m</value>
        <description>display values for time thresholds for tokenizer time histogram</description>
    </property>

    <!-- Default type class for mycsv fields (presumably applied to any field without its own
         '*.data.field.type.class' entry; confirm against the ingest helper). This property
         was previously declared twice with identical content; a single definition is kept. -->
    <property>
        <name>mycsv.data.default.type.class</name>
        <value>datawave.data.type.LcNoDiacriticsType</value>
        <description>Default type</description>
    </property>
    
    <!-- Event date handling. EVENT_DATE is the configured date field and is parsed with the
         comma-separated patterns in '*.date.formats' (they look like Java SimpleDateFormat
         patterns; confirm). The id-parser entry configures a DateIdParser for EVENT_ID with a
         regular expression capturing seven digits and the pattern "yyyyDDD" (year + day of year);
         presumably it derives a date from the event id. Verify against DateIdParser. -->
    <property>
        <name>mycsv.data.category.date</name>
        <value>EVENT_DATE</value>
    </property>

    <property>
        <name>mycsv.data.category.date.formats</name>
        <value>yyyy-MM-dd'T'HH:mm:ss'Z',yyyy-MM-dd HH:mm:ss</value>
    </property>

    <property>
        <name>mycsv.data.id.parser.EVENT_ID.1</name>
        <value>datawave.ingest.metadata.id.DateIdParser("....\\.[a-zA-Z]([0-9]{7}).*", "yyyyDDD")</value>
    </property>
    
    <!-- Per-record datatype override and MY_DATE handling. DATA_TYPE is the field consulted for
         a datatype override value. MY_DATE is given the DateType class and is mapped in the
         date index as MY_DATE_TYPE=MY_DATE.
         NOTE(review): neither DATA_TYPE nor MY_DATE appears in 'mycsv.data.header'; presumably
         both arrive as extra name=value fields. Confirm against the sample data. -->
    <property>
        <name>mycsv.data.type.field.name</name>
        <value>DATA_TYPE</value>
        <description>where to look for a data type override value</description>
    </property>
    
    <property>
        <name>mycsv.MY_DATE.data.field.type.class</name>
        <value>datawave.data.type.DateType</value>
    </property>
    
    <property>
        <name>mycsv.date.index.type.to.field.map</name>
        <value>MY_DATE_TYPE=MY_DATE</value>
    </property>
    
</configuration>