Skip to content

Commit 71f8423

Browse files
committed
TIKA-1936 -- clean up parsers and tests that aren't cleaning up tmp files, with heavy refactoring of PDFParser tests.
1 parent f89a19f commit 71f8423

File tree

8 files changed

+207
-334
lines changed

8 files changed

+207
-334
lines changed

tika-core/src/test/java/org/apache/tika/TikaTest.java

+27-8
Original file line number | Diff line number | Diff line change
@@ -108,25 +108,40 @@ public XMLResult(String xml, Metadata metadata) {
108108
this.metadata = metadata;
109109
}
110110
}
111-
protected XMLResult getXML(String filePath, Parser parser) throws Exception {
112-
return getXML(filePath, parser, new Metadata());
111+
112+
protected XMLResult getXML(String filePath, Parser parser, ParseContext context) throws Exception {
113+
return getXML(getResourceAsStream("/test-documents/" + filePath), parser, new Metadata(), context);
113114
}
114115

115116
protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception {
116-
return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata);
117+
return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata, null);
118+
}
119+
120+
protected XMLResult getXML(String filePath, ParseContext parseContext) throws Exception {
121+
return getXML(filePath, new AutoDetectParser(), parseContext);
117122
}
118123

119124
protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
120-
return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata);
125+
return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata, null);
126+
}
127+
128+
protected XMLResult getXML(String filePath, Parser parser) throws Exception {
129+
return getXML(filePath, parser, new Metadata());
121130
}
122131

123132
protected XMLResult getXML(String filePath) throws Exception {
124-
return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata());
133+
return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata(), null);
125134
}
126135

127136
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
128-
ParseContext context = new ParseContext();
129-
context.set(Parser.class, parser);
137+
return getXML(input, parser, metadata, null);
138+
}
139+
140+
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
141+
if (context == null) {
142+
context = new ParseContext();
143+
context.set(Parser.class, parser);
144+
}
130145

131146
try {
132147
ContentHandler handler = new ToXMLContentHandler();
@@ -138,11 +153,15 @@ protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata)
138153
}
139154

140155
protected List<Metadata> getRecursiveJson(String filePath) throws Exception {
156+
return getRecursiveJson(filePath, new ParseContext());
157+
}
158+
159+
protected List<Metadata> getRecursiveJson(String filePath, ParseContext context) throws Exception {
141160
Parser p = new AutoDetectParser();
142161
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
143162
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
144163
try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
145-
wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
164+
wrapper.parse(is, new DefaultHandler(), new Metadata(), context);
146165
}
147166
return wrapper.getMetadata();
148167
}

tika-example/src/main/java/org/apache/tika/example/ParsingExample.java

+9-5
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828

2929
import org.apache.tika.Tika;
3030
import org.apache.tika.exception.TikaException;
31+
import org.apache.tika.io.TikaInputStream;
3132
import org.apache.tika.metadata.Metadata;
3233
import org.apache.tika.metadata.serialization.JsonMetadataList;
3334
import org.apache.tika.parser.AutoDetectParser;
@@ -203,13 +204,16 @@ public String serializedRecursiveParserWrapperExample() throws IOException,
203204
*/
204205
public List<Path> extractEmbeddedDocumentsExample(Path outputPath) throws IOException,
205206
SAXException, TikaException {
206-
InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx");
207207
ExtractEmbeddedFiles ex = new ExtractEmbeddedFiles();
208-
ex.extract(stream, outputPath);
209208
List<Path> ret = new ArrayList<>();
210-
try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(outputPath)) {
211-
for (Path entry : dirStream) {
212-
ret.add(entry);
209+
try (TikaInputStream stream =
210+
TikaInputStream.get(
211+
ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx"))) {
212+
ex.extract(stream, outputPath);
213+
try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(outputPath)) {
214+
for (Path entry : dirStream) {
215+
ret.add(entry);
216+
}
213217
}
214218
}
215219
return ret;

tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java

+17-10
Original file line number | Diff line number | Diff line change
@@ -17,29 +17,30 @@
1717
package org.apache.tika.parser.mat;
1818

1919
//JDK imports
20+
import static java.nio.charset.StandardCharsets.UTF_8;
21+
2022
import java.io.IOException;
2123
import java.io.InputStream;
2224
import java.util.Collections;
23-
import java.util.Set;
2425
import java.util.Map;
26+
import java.util.Set;
2527

28+
import com.jmatio.io.MatFileHeader;
29+
import com.jmatio.io.MatFileReader;
30+
import com.jmatio.types.MLArray;
31+
import com.jmatio.types.MLStructure;
2632
import org.apache.tika.exception.TikaException;
33+
import org.apache.tika.io.TemporaryResources;
2734
import org.apache.tika.io.TikaInputStream;
2835
import org.apache.tika.metadata.Metadata;
36+
import org.apache.tika.mime.MediaType;
2937
import org.apache.tika.parser.AbstractParser;
3038
import org.apache.tika.parser.ParseContext;
31-
import org.apache.tika.mime.MediaType;
3239
import org.apache.tika.sax.XHTMLContentHandler;
3340
import org.xml.sax.ContentHandler;
3441
import org.xml.sax.SAXException;
3542

3643
//JMatIO imports
37-
import com.jmatio.io.MatFileHeader;
38-
import com.jmatio.io.MatFileReader;
39-
import com.jmatio.types.MLArray;
40-
import com.jmatio.types.MLStructure;
41-
42-
import static java.nio.charset.StandardCharsets.UTF_8;
4344

4445

4546
public class MatParser extends AbstractParser {
@@ -59,10 +60,12 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
5960

6061
//Set MIME type as Matlab
6162
metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
62-
63+
TemporaryResources tmp =
64+
TikaInputStream.isTikaInputStream(stream) ? null :
65+
new TemporaryResources();
6366
try {
6467
// Use TIS so we can spool a temp file for parsing.
65-
TikaInputStream tis = TikaInputStream.get(stream);
68+
TikaInputStream tis = TikaInputStream.get(stream, tmp);
6669

6770
//Extract information from header file
6871
MatFileReader mfr = new MatFileReader(tis.getFile()); //input .mat file
@@ -128,6 +131,10 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
128131
xhtml.endDocument();
129132
} catch (IOException e) {
130133
throw new TikaException("Error parsing Matlab file with MatParser", e);
134+
} finally {
135+
if (tmp != null) {
136+
tmp.dispose();
137+
}
131138
}
132139
}
133140
}

tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java

+15-7
Original file line number | Diff line number | Diff line change
@@ -98,8 +98,7 @@ public MetadataExtractor getMetadataExtractor() {
9898
}
9999

100100
/**
101-
* @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
102-
* org.apache.tika.metadata.Metadata)
101+
* @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(ContentHandler, Metadata, ParseContext)
103102
*/
104103
public void getXHTML(
105104
ContentHandler handler, Metadata metadata, ParseContext context)
@@ -220,9 +219,9 @@ private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String
220219

221220
// Open the POIFS (OLE2) structure and process
222221
POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
222+
TikaInputStream stream = null;
223223
try {
224224
Metadata metadata = new Metadata();
225-
TikaInputStream stream = null;
226225
metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
227226

228227
DirectoryNode root = fs.getRoot();
@@ -265,6 +264,13 @@ stream, new EmbeddedContentHandler(handler),
265264
// There was no CONTENTS entry, so skip this part
266265
} catch (Ole10NativeException e) {
267266
// Could not process an OLE 1.0 entry, so skip this part
267+
} finally {
268+
if (fs != null) {
269+
fs.close();
270+
}
271+
if (stream != null) {
272+
stream.close();
273+
}
268274
}
269275
}
270276

@@ -288,10 +294,12 @@ protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, Stri
288294

289295
// Call the recursing handler
290296
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
291-
embeddedExtractor.parseEmbedded(
292-
TikaInputStream.get(part.getInputStream()),
293-
new EmbeddedContentHandler(handler),
294-
metadata, false);
297+
try(TikaInputStream tis = TikaInputStream.get(part.getInputStream())) {
298+
embeddedExtractor.parseEmbedded(
299+
tis,
300+
new EmbeddedContentHandler(handler),
301+
metadata, false);
302+
}
295303
}
296304
}
297305

tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java

+7-1
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
import java.util.Set;
2626

2727
import org.apache.tika.exception.TikaException;
28+
import org.apache.tika.io.TemporaryResources;
2829
import org.apache.tika.io.TikaInputStream;
2930
import org.apache.tika.metadata.Metadata;
3031
import org.apache.tika.metadata.Property;
@@ -80,7 +81,9 @@ public void parse(InputStream stream, ContentHandler handler,
8081
Metadata metadata, ParseContext context) throws IOException,
8182
SAXException, TikaException {
8283

83-
TikaInputStream tis = TikaInputStream.get(stream);
84+
TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ?
85+
null : new TemporaryResources();
86+
TikaInputStream tis = TikaInputStream.get(stream, tmp);
8487
NetcdfFile ncFile = null;
8588
try {
8689
ncFile = NetcdfFile.open(tis.getFile().getAbsolutePath());
@@ -134,6 +137,9 @@ public void parse(InputStream stream, ContentHandler handler,
134137
if (ncFile != null) {
135138
ncFile.close();
136139
}
140+
if (tmp != null) {
141+
tmp.dispose();
142+
}
137143
}
138144
}
139145

tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

+2-7
Original file line number | Diff line number | Diff line change
@@ -969,13 +969,8 @@ public void testMissingText() throws Exception {
969969
//TIKA-1100:
970970
@Test
971971
public void testExcelTextBox() throws Exception {
972-
Metadata metadata = new Metadata();
973-
ContentHandler handler = new BodyContentHandler();
974-
ParseContext context = new ParseContext();
975-
InputStream input = getTestDocument("testEXCEL_textbox.xlsx");
976-
parser.parse(input, handler, metadata, context);
977-
String content = handler.toString();
978-
assertContains("some autoshape", content);
972+
XMLResult r = getXML("testEXCEL_textbox.xlsx", parser);
973+
assertContains("some autoshape", r.xml);
979974
}
980975

981976
//TIKA-792; with room for future missing bean tests

0 commit comments

Comments
 (0)