Skip to content

Commit 98eb56e

Browse files
committed
TIKA-1285 -- upgrade to PDFBox 2.0.0
1 parent 1924c3f commit 98eb56e

File tree

11 files changed

+162
-323
lines changed

11 files changed

+162
-323
lines changed

CHANGES.txt

+6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
Release 1.13 - ???
22

3+
* Upgrade to PDFBox 2.0.0 (TIKA-1285). MAJOR CHANGES in PDFParser:
4+
* The classic sequential parser is no longer available.
5+
* TIFF files are no longer extracted by default. See
6+
https://pdfbox.apache.org/2.0/dependencies.html#optional-components
7+
for the optional components needed to process TIFF files.
8+
39
* Add XMPMM support to PDFParser and JpegParser via Jempbox (TIKA-1894).
410

511
* Move serialization of TikaConfig to tika-core and enable dumping

tika-bundle/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@
127127
tika-parsers;inline=true,
128128
commons-compress, xz, commons-codec, commons-csv,
129129
commons-io, commons-exec, junrar,
130-
pdfbox,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on,
130+
pdfbox,pdfbox-tools,pdfbox-debugger,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on,
131131
poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas,
132132
curvesapi,
133133
xmlbeans,

tika-parsers/pom.xml

+21-2
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@
4242
<tukaani.version>1.5</tukaani.version>
4343
<mime4j.version>0.7.2</mime4j.version>
4444
<vorbis.version>0.8</vorbis.version>
45-
<pdfbox.version>1.8.11</pdfbox.version>
45+
<pdfbox.version>2.0.0</pdfbox.version>
46+
<jempbox.version>1.8.11</jempbox.version>
4647
<netcdf-java.version>4.5.5</netcdf-java.version>
4748
<cxf.version>3.0.3</cxf.version>
4849
<sis.version>0.6</sis.version>
@@ -133,6 +134,16 @@
133134
<artifactId>pdfbox</artifactId>
134135
<version>${pdfbox.version}</version>
135136
</dependency>
137+
<dependency>
138+
<groupId>org.apache.pdfbox</groupId>
139+
<artifactId>pdfbox-tools</artifactId>
140+
<version>${pdfbox.version}</version>
141+
</dependency>
142+
<dependency>
143+
<groupId>org.apache.pdfbox</groupId>
144+
<artifactId>jempbox</artifactId>
145+
<version>${jempbox.version}</version>
146+
</dependency>
136147
<!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
137148
as optional, but we prefer to have them always to avoid
138149
problems with encrypted PDFs. -->
@@ -298,7 +309,15 @@
298309
<artifactId>slf4j-log4j12</artifactId>
299310
<scope>test</scope>
300311
</dependency>
301-
312+
<!-- Copied from PDFBox:
313+
For legal reasons (incompatible license), jai-imageio-core is to be used
314+
only in the tests and may not be distributed. See also LEGAL-195 -->
315+
<dependency>
316+
<groupId>com.github.jai-imageio</groupId>
317+
<artifactId>jai-imageio-core</artifactId>
318+
<version>1.3.1</version>
319+
<scope>test</scope>
320+
</dependency>
302321
<!-- edu.ucar dependencies -->
303322
<dependency>
304323
<groupId>edu.ucar</groupId>

tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java

+10-6
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,13 @@
1818

1919
import java.io.IOException;
2020
import java.io.InputStream;
21+
import java.util.ArrayList;
2122
import java.util.Collections;
2223
import java.util.List;
2324
import java.util.Set;
2425

2526
import org.apache.fontbox.afm.AFMParser;
26-
import org.apache.fontbox.afm.FontMetric;
27+
import org.apache.fontbox.afm.FontMetrics;
2728
import org.apache.tika.exception.TikaException;
2829
import org.apache.tika.metadata.Metadata;
2930
import org.apache.tika.metadata.Property;
@@ -67,16 +68,19 @@ public Set<MediaType> getSupportedTypes( ParseContext context ) {
6768
public void parse(InputStream stream, ContentHandler handler,
6869
Metadata metadata, ParseContext context)
6970
throws IOException, SAXException, TikaException {
70-
FontMetric fontMetrics;
71+
FontMetrics fontMetrics;
7172
AFMParser parser = new AFMParser( stream );
7273

7374
// Have FontBox process the file
74-
parser.parse();
75-
fontMetrics = parser.getResult();
75+
fontMetrics = parser.parse();
7676

7777
// Get the comments in the file to display in xhtml
78-
List<String> comments = fontMetrics.getComments();
79-
78+
List<String> unModifiableComments = fontMetrics.getComments();
79+
// Have to copy into a new list because extractCreationDate modifies the list
80+
List<String> comments = new ArrayList<>();
81+
for (String comment : unModifiableComments) {
82+
comments.add(comment);
83+
}
8084
// Get the creation date
8185
extractCreationDate( metadata, comments );
8286

tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,9 @@ public void parse(
6464
TrueTypeFont font;
6565
TTFParser parser = new TTFParser();
6666
if (tis != null && tis.hasFile()) {
67-
font = parser.parseTTF(tis.getFile());
67+
font = parser.parse(tis.getFile());
6868
} else {
69-
font = parser.parseTTF(stream);
69+
font = parser.parse(stream);
7070
}
7171

7272
// Report the details of the font

0 commit comments

Comments
 (0)