<?xml version="1.0"?>
<!-- myjson-ingest-config.xml (forked from NationalSecurityAgency/datawave) -->
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Configuration for ingesting arbitrary Json. For demonstration purposes, this config uses
the following example data file as the basis for some of the "known field" settings below:
warehouse/ingest-json/src/test/resources/input/tvmaze-api.json
- File was populated via "http://api.tvmaze.com/singlesearch/shows?q={SHOW_NAME}&embed=cast"
using this script:
datawave-quickstart/bin/services/datawave/ingest-examples/ingest-tv-shows.sh
- Includes shows: Orange is the New Black, The Middle, Everybody Loves Raymond, The Honeymooners, Westworld,
The Walking Dead, The Big Bang Theory, Star Trek, T.J. Hooker, American Pickers, Pawn Stars, Seinfeld,
The Following, and Tremors
However, the JsonInputFormat, JsonRecordReader, and JsonIngestHelper classes make no
assumptions about the Json schemas to be parsed. They can be used to ingest a wide
variety of Json representations with little or no additional customization
E.g., see 'myjson.data.json.flattener.mode' property below for details on supported
parsing behaviors. Also, classes and configs may be extended as needed to handle more
specialized parsing/extraction.
Also, the script datawave/ingest-examples/ingest-tv-shows.sh will allow you to query api.tvmaze.com for TV shows
of interest and ingest the resulting JSON data on demand.
-->
<configuration>
<!-- Input format class for this data type. JsonInputFormat is one of the Json ingest classes
     that make no assumptions about the Json schema being parsed.
     NOTE(review): presumably the Hadoop InputFormat used by the ingest job to read raw files; confirm. -->
<property>
<name>file.input.format</name>
<value>datawave.ingest.json.mr.input.JsonInputFormat</value>
</property>
<property>
    <name>data.name</name>
    <value>myjson</value>
    <description>
        This is the name of the datatype, which distinguishes it from other types for the purposes of ingest
        processing and perhaps even for dataflow/transport concerns. As such, this can be used to denote a subtype of
        some common data format, like CSV files for example, which could originate from any number of sources
    </description>
</property>
<property>
    <name>myjson.output.name</name>
    <value>tvmaze</value>
    <description>
        This is the name to use on the data in Accumulo
    </description>
</property>
<!-- Ingest helper class for the 'myjson' data type. JsonIngestHelper is one of the two classes
     that support the json-flattening modes selected via 'myjson.data.json.flattener.mode'. -->
<property>
<name>myjson.ingest.helper.class</name>
<value>datawave.ingest.json.config.helper.JsonIngestHelper</value>
</property>
<!-- Record reader class for the 'myjson' data type. JsonRecordReader is one of the two classes
     that support the json-flattening modes selected via 'myjson.data.json.flattener.mode'. -->
<property>
<name>myjson.reader.class</name>
<value>datawave.ingest.json.mr.input.JsonRecordReader</value>
</property>
<property>
    <name>myjson.handler.classes</name>
    <value>datawave.ingest.json.mr.handler.ContentJsonColumnBasedHandler</value>
    <description>
        List of classes that should process each event
    </description>
</property>
<property>
    <name>myjson.data.category.uuid.fields</name>
    <value>ID,EXTERNALS_THETVDB,EXTERNALS_TVRAGE,EXTERNALS_IMDB</value>
    <description>
        List of known fields that contain UUIDs
    </description>
</property>
<property>
    <name>myjson.data.separator</name>
    <value>,</value>
    <description>
        This is the separator to use for delimited text, and between configuration file parameters with multiple values.
    </description>
</property>
<property>
    <name>myjson.data.header</name>
    <value>ID,NAME,PREMIERED,RUNTIME,STATUS,SUMMARY,OFFICIALSITE,LANGUAGE,GENRES,WEIGHT,URL,TYPE</value>
    <description>
        Known metadata fields that may be expected to appear in every json document. Often, these may be
        "required" fields, and/or fields that you want to use for policy enforcement, quality assurance, etc
    </description>
</property>
<property>
    <name>myjson.data.process.extra.fields</name>
    <value>true</value>
    <description>
        If true, "extra" fields within the json tree (ie, those outside the defined "header") should be
        processed. Otherwise, everything outside the header will be ignored unless explicitly whitelisted
    </description>
</property>
<property>
<name>myjson.data.json.flattener.mode</name>
<value>GROUPED_AND_NORMAL</value>
<description>The classes datawave.ingest.json.mr.input.JsonRecordReader and
datawave.ingest.json.config.helper.JsonIngestHelper support 4 different json-flattening modes:
(1) 'SIMPLE' mode:
Ignores nested objects. Only retrieves root-level primitives, including primitives within root-level arrays.
Array primitives will be represented as multi-valued keys in the resulting map. If your json is flat already,
or if you wish to ignore nested objects for whatever reason, this mode should suffice. If you require
complete tree traversal, see NORMAL and GROUPED modes.
PROs:
A good choice if all of your incoming json is flat. Least amount of overhead in terms of json parsing
CONs:
A bad choice if you need to ingest json schemas where items of interest are nested
(2) 'NORMAL' mode:
Traverses the entire Json tree, unlike SIMPLE mode. Given a nested 'fieldname' property @ level 4 in the
tree and path delimiter value of '_', the flattened result would take the form:
GREATGRANDPARENT_GRANDPARENT_PARENT_FIELDNAME = Value
At query time, the full identifier is used as the search term
PROs:
The field names users will employ in queries match the structure of the underlying raw data, so
DataWave's data element dictionary tends to be self-explanatory and intuitive for end users
CONs:
Can lead to very long field names, especially for deeply nested json schemas. Also, if users will
need the ability to easily disambiguate json elements at query time based on their original position within
the json tree, then 'NORMAL' mode will be problematic, since no context information is retained
(3) 'GROUPED' mode:
Full parse as with NORMAL mode, but instead we append the hierarchical context onto 'FIELDNAME' as a
dot-delimited suffix, with additional information to identify the ordinal position or occurrence at each
level...
E.g., FIELDNAME.greatgrandparent_0.grandparent_1.parent_3.fieldname_0 = Value
...where '_#' denotes the specific occurrence of the element within the given level of the hierarchy.
At query time, only the 'FIELDNAME' identifier is used as the search term. If FIELDNAME happens to be
a repeated field within the JSON document and users will need the ability to disambiguate them at query
time, DataWave provides filtering functions to support that...
See datawave-quickstart/bin/services/datawave/test-web/tests/EventQueryJsonGrouped.test for examples
PROs:
Allows users to query with the shortest, ie most compact, field names possible. Also tends to
keep the overall size in bytes of the data dictionary and forward/reverse indices to a minimum,
since only the 'FIELDNAME' portion of the key is stored in those places; only the 'shard' table
retains the full context which is used only for filtering purposes as needed
CONs:
Since the data dictionary only knows about the 'FIELDNAME' portion of the names, users may lose
sight of the underlying structure and semantics of the raw data for query purposes. That may lead
to confusion, particularly if the repository will store data from multiple distinct sources and those
sources differ in format/structure yet have 'FIELDNAME' elements in common.
(4) 'GROUPED_AND_NORMAL' mode:
Retains the fully-qualified 'NORMAL' mode field name as the prefix, and appends the full context suffix
as defined by 'GROUPED' mode above, for maximum flexibility
E.g., GRANDPARENT_PARENT_FIELDNAME.grandparent_0.parent_2.fieldname_0
PROs:
As with 'NORMAL' mode, the data dictionary elements and index entries will reflect the structure of
the incoming json, which may be more intuitive for users, alleviating confusion around field name
semantics, etc. Also allows users to easily disambiguate values from different parts of the json's
hierarchical structure, if needed
CONs:
Requires the most storage in bytes for fieldname storage in the shard table, since the path information
is replicated in the suffix along with the ordinal context
</description>
</property>
<!--<property>-->
<!--<name>myjson.data.category.marking.visibility.field</name>-->
<!--<value>DOCUMENT_VISIBILITY</value>-->
<!--<description>Known field in every record that will contain the event's ColumnVisibility for Accumulo.-->
<!--If the raw data doesn't convey security markings, then utilize the '.data.category.marking.default' property-->
<!--instead, to declare the default marking to be applied to every field</description>-->
<!--</property>-->
<property>
<name>myjson.data.category.marking.default</name>
<!-- A literal ampersand is not allowed in XML character data, so it is written as the
     predefined entity below. The parsed value seen by the application is: PRIVATE|(BAR&FOO) -->
<value>PRIVATE|(BAR&amp;FOO)</value>
<description>ColumnVisibility expression to be applied to each field, when the raw data is known to provide none</description>
</property>
<property>
    <name>myjson.SUMMARY.data.field.marking</name>
    <value>PUBLIC</value>
    <description>
        ColumnVisibility expression to be applied to the "SUMMARY" field
    </description>
</property>
<!--<property>-->
<!--<name>myjson.data.category.date</name>-->
<!--<value>PREMIERED</value>-->
<!--<description>Known date field to be used, if found, for the shard row id. Otherwise, current date will be used</description>-->
<!--</property>-->
<property>
    <name>myjson.data.category.date.formats</name>
    <value>yyyy-MM-dd,yyyy-MM-dd'T'HH:mm:ss'Z',yyyy-MM-dd HH:mm:ss</value>
    <description>
        Known/valid date formats for *.data.category.date field
    </description>
</property>
<!-- Indexing and tokenization-->
<property>
<name>myjson.data.category.index</name>
<!-- Each field is listed exactly once; the duplicated 'ID' entry was removed. -->
<value>NAME,ID,EXTERNALS_THETVDB,EXTERNALS_TVRAGE,EXTERNALS_IMDB,EMBEDDED_CAST_CHARACTER_NAME,EMBEDDED_CAST_PERSON_NAME,EMBEDDED_CAST_PERSON_ID,GENRES,NETWORK_NAME,OFFICIALSITE,TYPE,STATUS,RUNTIME,URL</value>
<description>List of known fields to index</description>
</property>
<property>
    <name>myjson.data.category.index.reverse</name>
    <value>NAME,NETWORK_NAME,OFFICIALSITE,URL</value>
    <description>
        List of known fields to reverse index
    </description>
</property>
<property>
    <name>myjson.data.category.token.fieldname.designator</name>
    <value>_TOKEN</value>
    <description>
        Field name suffix to be applied to field names that are tokenized.
        See *.data.category.index.tokenize.whitelist
    </description>
</property>
<property>
    <name>myjson.data.category.index.tokenize.whitelist</name>
    <value>SUMMARY,NETWORK_NAME,NAME,EMBEDDED_CAST_CHARACTER_NAME,EMBEDDED_CAST_PERSON_NAME</value>
    <description>
        These are the fields to tokenize and index.
    </description>
</property>
<property>
    <name>myjson.data.category.index.only</name>
    <value>SUMMARY_TOKEN,NETWORK_NAME_TOKEN,NAME_TOKEN,EMBEDDED_CAST_CHARACTER_NAME_TOKEN,EMBEDDED_CAST_PERSON_NAME_TOKEN</value>
    <description>
        Fields that will exist only in the global index. Will not be stored as part of the event/document
    </description>
</property>
<!-- Field Normalization -->
<property>
    <name>myjson.data.default.normalization.failure.policy</name>
    <value>FAIL</value>
    <description>
        For field normalization failures: DROP, LEAVE, FAIL.
        FAIL: the entire event/document will be dropped, and possibly written to the error schema in Accumulo.
        LEAVE: the non-normalized value will be kept as-is.
        DROP: the failed field will be dropped, and everything else retained
    </description>
</property>
<property>
    <name>myjson.data.default.type.class</name>
    <value>datawave.data.type.LcNoDiacriticsType</value>
    <description>
        Default type
    </description>
</property>
<!-- Field-specific type class for PREMIERED, used in place of 'myjson.data.default.type.class'.
     NOTE(review): DateType presumably normalizes values as dates; confirm against datawave.data.type.DateType. -->
<property>
<name>myjson.PREMIERED.data.field.type.class</name>
<value>datawave.data.type.DateType</value>
</property>
<!-- Field-specific type class for WEIGHT, used in place of 'myjson.data.default.type.class'.
     NOTE(review): NumberType presumably normalizes values as numbers; confirm against datawave.data.type.NumberType. -->
<property>
<name>myjson.WEIGHT.data.field.type.class</name>
<value>datawave.data.type.NumberType</value>
</property>
<!-- Field-specific type class for RUNTIME, used in place of 'myjson.data.default.type.class'.
     NOTE(review): NumberType presumably normalizes values as numbers; confirm against datawave.data.type.NumberType. -->
<property>
<name>myjson.RUNTIME.data.field.type.class</name>
<value>datawave.data.type.NumberType</value>
</property>
</configuration>