Skip to content

Commit 34db935

Browse files
committed
TIKA-1918: make outputSuffix optional in tika-batch
1 parent 9ebf066 commit 34db935

File tree

12 files changed

+222
-46
lines changed

12 files changed

+222
-46
lines changed

tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java

-7
Original file line numberDiff line numberDiff line change
@@ -160,35 +160,28 @@ private static void translateCommandLine(String[] args, Map<String, String> map)
160160
map.remove("-h");
161161
map.remove("--html");
162162
map.put("-basicHandlerType", "html");
163-
map.put("-outputSuffix", "html");
164163
} else if (map.containsKey("-x") || map.containsKey("--xml")) {
165164
map.remove("-x");
166165
map.remove("--xml");
167166
map.put("-basicHandlerType", "xml");
168-
map.put("-outputSuffix", "xml");
169167
} else if (map.containsKey("-t") || map.containsKey("--text")) {
170168
map.remove("-t");
171169
map.remove("--text");
172170
map.put("-basicHandlerType", "text");
173-
map.put("-outputSuffix", "txt");
174171
} else if (map.containsKey("-m") || map.containsKey("--metadata")) {
175172
map.remove("-m");
176173
map.remove("--metadata");
177174
map.put("-basicHandlerType", "ignore");
178-
map.put("-outputSuffix", "json");
179175
} else if (map.containsKey("-T") || map.containsKey("--text-main")) {
180176
map.remove("-T");
181177
map.remove("--text-main");
182178
map.put("-basicHandlerType", "body");
183-
map.put("-outputSuffix", "txt");
184179
}
185180

186181
if (map.containsKey("-J") || map.containsKey("--jsonRecursive")) {
187182
map.remove("-J");
188183
map.remove("--jsonRecursive");
189184
map.put("-recursiveParserWrapper", "true");
190-
//overwrite outputSuffix
191-
map.put("-outputSuffix", "json");
192185
}
193186

194187
if (map.containsKey("--inputDir") || map.containsKey("-i")) {

tika-app/src/main/resources/tika-app-batch-config.xml

+7-3
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,13 @@
124124
digest="md5" digestMarkLimit="1000000"/>
125125
<contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
126126
basicHandlerType="xml" writeLimit="-1"/>
127-
<!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" -->
128-
<!-- can include e.g. outputDir="output", but we don't want to include this in the default! -->
129-
<outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/>
127+
<!-- can specify custom output file suffix with:
128+
outputSuffix="mysuffix" (no leading "."; the attribute read by the builder is outputSuffix)
129+
if no suffix is specified, BasicTikaFSConsumersBuilder does its best to guess -->
130+
<!-- can specify compression with
131+
compression="bzip2|gzip|zip" -->
132+
133+
<outputstream class="FSOutputStreamFactory" encoding="UTF-8"/>
130134
</consumers>
131135

132136
<!-- reporter and interrupter are optional -->

tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java

-1
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,6 @@ public void testBasicMappingOfArgs() throws Exception {
113113
Map<String, String> attrs = mapify(commandLine);
114114
assertEquals("true", attrs.get("-recursiveParserWrapper"));
115115
assertEquals("html", attrs.get("-basicHandlerType"));
116-
assertEquals("json", attrs.get("-outputSuffix"));
117116
assertEquals("batch-config.xml", attrs.get("-bc"));
118117
assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
119118
}

tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java

+48-3
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import org.apache.tika.batch.fs.FSUtil;
4242
import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer;
4343
import org.apache.tika.config.TikaConfig;
44+
import org.apache.tika.sax.BasicContentHandlerFactory;
4445
import org.apache.tika.sax.ContentHandlerFactory;
4546
import org.apache.tika.util.ClassLoaderUtil;
4647
import org.apache.tika.util.PropsUtil;
@@ -125,7 +126,9 @@ public ConsumersManager build(Node node, Map<String, String> runtimeAttributes,
125126
}
126127
ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(contentHandlerFactoryNode, runtimeAttributes);
127128
ParserFactory parserFactory = getParserFactory(parserFactoryNode, runtimeAttributes);
128-
OutputStreamFactory outputStreamFactory = getOutputStreamFactory(outputStreamFactoryNode, runtimeAttributes);
129+
OutputStreamFactory outputStreamFactory = getOutputStreamFactory(
130+
outputStreamFactoryNode, runtimeAttributes,
131+
contentHandlerFactory, recursiveParserWrapper);
129132

130133
if (recursiveParserWrapper) {
131134
for (int i = 0; i < numConsumers; i++) {
@@ -147,7 +150,6 @@ public ConsumersManager build(Node node, Map<String, String> runtimeAttributes,
147150
return manager;
148151
}
149152

150-
151153
private ContentHandlerFactory getContentHandlerFactory(Node node, Map<String, String> runtimeAttributes) {
152154

153155
Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
@@ -166,7 +168,10 @@ private ParserFactory getParserFactory(Node node, Map<String, String> runtimeAtt
166168
return builder.build(node, runtimeAttributes);
167169
}
168170

169-
private OutputStreamFactory getOutputStreamFactory(Node node, Map<String, String> runtimeAttributes) {
171+
private OutputStreamFactory getOutputStreamFactory(Node node,
172+
Map<String, String> runtimeAttributes,
173+
ContentHandlerFactory contentHandlerFactory,
174+
boolean useRecursiveParserWrapper) {
170175
Map<String, String> attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
171176

172177
Path outputDir = PropsUtil.getPath(attrs.get("outputDir"), null);
@@ -196,6 +201,17 @@ private OutputStreamFactory getOutputStreamFactory(Node node, Map<String, String
196201
compression = FSOutputStreamFactory.COMPRESSION.ZIP;
197202
}
198203
String suffix = attrs.get("outputSuffix");
204+
//suffix should not start with "."
205+
if (suffix == null) {
206+
StringBuilder sb = new StringBuilder();
207+
if (useRecursiveParserWrapper) {
208+
sb.append("json");
209+
} else if (contentHandlerFactory instanceof BasicContentHandlerFactory) {
210+
appendSuffix(((BasicContentHandlerFactory) contentHandlerFactory).getType(), sb);
211+
}
212+
appendCompression(compression, sb);
213+
suffix = sb.toString();
214+
}
199215

200216
//TODO: possibly open up the different handle-existings in the future
201217
//but for now, lock it down to require skip. Too dangerous otherwise
@@ -204,4 +220,33 @@ private OutputStreamFactory getOutputStreamFactory(Node node, Map<String, String
204220
compression, suffix);
205221
}
206222

223+
private void appendCompression(FSOutputStreamFactory.COMPRESSION compression, StringBuilder sb) {
224+
switch (compression) {
225+
case NONE:
226+
break;
227+
case ZIP:
228+
sb.append(".zip");
229+
break;
230+
case BZIP2:
231+
sb.append(".bz2");
232+
break;
233+
case GZIP:
234+
sb.append(".gz");
235+
break;
236+
}
237+
}
238+
239+
private void appendSuffix(BasicContentHandlerFactory.HANDLER_TYPE type, StringBuilder sb) {
240+
switch (type) {
241+
case XML:
242+
sb.append("xml");
243+
break;
244+
case HTML:
245+
sb.append("html");
246+
break;
247+
default :
248+
sb.append("txt");
249+
}
250+
}
251+
207252
}

tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml

+27-23
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,13 @@
2626
<tika-batch-config
2727
maxAliveTimeSeconds="-1"
2828
pauseOnEarlyTerminationMillis="10000"
29-
timeoutThresholdMillis="300000"
30-
timeoutCheckPulseMillis="1000"
31-
maxQueueSize="10000"
32-
numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 -->
33-
34-
<!-- options to allow on the commandline -->
35-
<commandline>
29+
timeoutThresholdMillis="300000"
30+
timeoutCheckPulseMillis="1000"
31+
maxQueueSize="10000"
32+
numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 -->
33+
34+
<!-- options to allow on the commandline -->
35+
<commandline>
3636
<option opt="c" longOpt="tika-config" hasArg="true"
3737
description="TikaConfig file"/>
3838
<option opt="bc" longOpt="batch-config" hasArg="true"
@@ -72,14 +72,14 @@
7272
<option opt="timeoutThresholdMillis" hasArg="true"
7373
description="how long to wait before determining that a consumer is stale"/>
7474
<option opt="includeFilePat" hasArg="true"
75-
description="regex that specifies which files to process"/>
76-
<option opt="excludeFilePat" hasArg="true"
77-
description="regex that specifies which files to avoid processing"/>
78-
<option opt="reporterSleepMillis" hasArg="true"
79-
description="millisecond between reports by the reporter"/>
80-
</commandline>
81-
82-
75+
description="regex that specifies which files to process"/>
76+
<option opt="excludeFilePat" hasArg="true"
77+
description="regex that specifies which files to avoid processing"/>
78+
<option opt="reporterSleepMillis" hasArg="true"
79+
description="millisecond between reports by the reporter"/>
80+
</commandline>
81+
82+
8383
<!-- can specify inputDir="input", but the default config should not include this -->
8484
<!-- can also specify startDir="input/someDir" to specify which child directory
8585
to start processing -->
@@ -116,12 +116,16 @@
116116
parseRecursively="true"/>
117117
<contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
118118
basicHandlerType="xml" writeLimit="-1"/>
119-
<!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" --> <!-- can include e.g. outputDir="output", but we don't want to include this in the default! -->
120-
<outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/>
121-
</consumers>
122-
123-
<!-- reporter and interrupter are optional -->
124-
<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
125-
reporterStaleThresholdMillis="60000"/>
126-
<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
119+
<!-- can specify custom output file suffix with:
120+
outputSuffix="mysuffix" (no leading "."; the attribute read by the builder is outputSuffix)
121+
if no suffix is specified, BasicTikaFSConsumersBuilder does its best to guess -->
122+
<!-- can specify compression with
123+
compression="bzip2|gzip|zip" -->
124+
<outputstream class="FSOutputStreamFactory" encoding="UTF-8"/>
125+
</consumers>
126+
127+
<!-- reporter and interrupter are optional -->
128+
<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
129+
reporterStaleThresholdMillis="60000"/>
130+
<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
127131
</tika-batch-config>

tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java

+17-2
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,6 @@ public void testHierarchicalWFileList() throws Exception {
276276
Paths.get(this.getClass().getResource("/testFileList.txt").toURI()).toString());
277277
args.put("recursiveParserWrapper", "true");
278278
args.put("basicHandlerType", "text");
279-
args.put("outputSuffix", "json");
280279
BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args, "/tika-batch-config-MockConsumersBuilder.xml");
281280
ex.execute();
282281
Path test1 = outputDir.resolve("test1.xml.json");
@@ -302,7 +301,6 @@ public void testHandlingOfIllegalXMLCharsInException() throws Exception {
302301
args.put("numConsumers", "1");
303302
args.put("recursiveParserWrapper", "true");
304303
args.put("basicHandlerType", "text");
305-
args.put("outputSuffix", "json");
306304

307305
BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args,
308306
"/tika-batch-config-MockConsumersBuilder.xml",
@@ -312,6 +310,23 @@ public void testHandlingOfIllegalXMLCharsInException() throws Exception {
312310
assertContains("parse_ex resourceId=\"test0_bad_chars.xml\"", ss.getOutString());
313311
}
314312

313+
@Test
314+
public void testOverrideOutputSuffix() throws Exception {
315+
Path outputDir = getNewOutputDir("outputSuffixTest");
316+
317+
Map<String, String> args = getDefaultArgs("basic", outputDir);
318+
args.put("numConsumers", "1");
319+
args.put("recursiveParserWrapper", "true");
320+
args.put("basicHandlerType", "text");
321+
322+
BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args,
323+
"/tika-batch-config-test-suffix-override.xml",
324+
"/log4j-on.properties");
325+
ex.execute();
326+
Path targ = outputDir.resolve("test0.xml.mysuffix");
327+
assertTrue(Files.isRegularFile(targ));
328+
}
329+
315330
private class BatchProcessTestExecutor {
316331
private final Map<String, String> args;
317332
private final String configPath;

tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java

-4
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ public void testXML() throws Exception {
3636
Path outputDir = getNewOutputDir("handler-xml-");
3737
Map<String, String> args = getDefaultArgs("basic", outputDir);
3838
args.put("basicHandlerType", "xml");
39-
args.put("outputSuffix", "xml");
4039

4140
BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);
4241
ParallelFileProcessingResult result = run(runner);
@@ -54,7 +53,6 @@ public void testHTML() throws Exception {
5453

5554
Map<String, String> args = getDefaultArgs("basic", outputDir);
5655
args.put("basicHandlerType", "html");
57-
args.put("outputSuffix", "html");
5856
BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);
5957
ParallelFileProcessingResult result = run(runner);
6058
Path outputFile = outputDir.resolve("test0.xml.html");
@@ -70,7 +68,6 @@ public void testText() throws Exception {
7068

7169
Map<String, String> args = getDefaultArgs("basic", outputDir);
7270
args.put("basicHandlerType", "txt");
73-
args.put("outputSuffix", "txt");
7471

7572
BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);
7673
ParallelFileProcessingResult result = run(runner);
@@ -105,7 +102,6 @@ public void testRecursiveParserWrapper() throws Exception {
105102

106103
Map<String, String> args = getDefaultArgs("basic", outputDir);
107104
args.put("basicHandlerType", "txt");
108-
args.put("outputSuffix", "json");
109105
args.put("recursiveParserWrapper", "true");
110106

111107
BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);

tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@
103103
basicHandlerType="xml" writeLimit="-1"/>
104104

105105
<outputstream class="FSOutputStreamFactory"
106-
encoding="UTF-8" outputSuffix="xml"/>
106+
encoding="UTF-8"/>
107107
</consumers>
108108

109109
<!-- reporter and interrupter are optional -->

tika-batch/src/test/resources/tika-batch-config-broken.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@
9797
basicHandlerType="xml" writeLimit="-1"/>
9898

9999
<outputstream class="FSOutputStreamFactory"
100-
encoding="UTF-8" outputSuffix="xml"/>
100+
encoding="UTF-8"/>
101101
</consumers>
102102

103103
<!-- reporter and interrupter are optional -->

0 commit comments

Comments
 (0)