Skip to content

Commit 7d89a5e

Browse files
committed
TIKA-431: Tika currently misuses the HTTP Content-Encoding header, and does not seem to use the charset part of the Content-Type header properly.
Make the text and HTML parsers return the character encoding as a charset parameter in the content type metadata field. git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1358858 13f79535-47bb-0310-9956-ffa450edef68
1 parent 95a1cf9 commit 7d89a5e

File tree

18 files changed

+325
-132
lines changed

18 files changed

+325
-132
lines changed

.gitattributes

+2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
tika-parsers/src/test/resources/test-documents/testARofText.ar eol=lf
22
tika-parsers/src/test/resources/test-documents/testEMLX.emlx eol=lf
3+
tika-parsers/src/test/resources/test-documents/testTXT.txt eol=lf
4+
tika-parsers/src/test/resources/test-documents/testHTML.html eol=lf

CHANGES.txt

+9
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,15 @@ Release 1.2 - Current Development
4343
ICU4J algorithms are still used as a fallback thanks to their wider
4444
coverage of custom character encodings. (TIKA-322, TIKA-471)
4545

46+
* Charset parameter: Related to the character encoding improvements
47+
mentioned above, Tika now returns the detected character encoding as
48+
a "charset" parameter of the content type metadata field for text/plain
49+
and text/html documents. For example, instead of just "text/plain", the
50+
returned content type will be something like "text/plain; charset=UTF-8"
51+
for a UTF-8 encoded text document. Character encoding information is also
52+
still available in the content encoding metadata field for backwards
53+
compatibility, but that field should be considered deprecated. (TIKA-431)
54+
4655
* Extraction of embedded resources from OLE2 Office Documents, where
4756
the resource isn't another office document, has been fixed (TIKA-948)
4857

tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java

-2
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,8 @@
2626
import java.util.Collections;
2727
import java.util.Enumeration;
2828
import java.util.HashMap;
29-
import java.util.HashSet;
3029
import java.util.List;
3130
import java.util.Map;
32-
import java.util.Set;
3331
import java.util.regex.Pattern;
3432

3533
/**

tika-core/src/main/java/org/apache/tika/detect/TextDetector.java

+11-23
Original file line numberDiff line numberDiff line change
@@ -116,30 +116,18 @@ public MediaType detect(InputStream input, Metadata metadata)
116116

117117
input.mark(bytesToTest);
118118
try {
119-
int chars = 0;
120-
int controls = 0;
121-
int asciis = 0;
122-
int ch = input.read();
123-
while (ch != -1 && chars < bytesToTest) {
124-
if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
125-
controls++;
126-
} else if (ch < 127) {
127-
asciis++;
128-
}
129-
ch = input.read();
130-
chars++;
119+
TextStatistics stats = new TextStatistics();
120+
121+
byte[] buffer = new byte[1024];
122+
int n = 0;
123+
int m = input.read(buffer, 0, Math.min(bytesToTest, buffer.length));
124+
while (m != -1 && n < bytesToTest) {
125+
stats.addData(buffer, 0, m);
126+
n += m;
127+
m = input.read(buffer, 0, Math.min(bytesToTest - n, buffer.length));
131128
}
132-
if (chars == 0) {
133-
// Empty document, so treat it as binary
134-
// See https://issues.apache.org/jira/browse/TIKA-483
135-
return MediaType.OCTET_STREAM;
136-
} else if (controls == 0) {
137-
// No control characters, so treat it as text
138-
return MediaType.TEXT_PLAIN;
139-
} else if (controls < chars * 2 / 100
140-
&& asciis > chars * 90 / 100) {
141-
// Almost plain text (< 2% control, > 90% ASCII range)
142-
// See https://issues.apache.org/jira/browse/TIKA-688
129+
130+
if (stats.isMostlyAscii()) {
143131
return MediaType.TEXT_PLAIN;
144132
} else {
145133
return MediaType.OCTET_STREAM;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.detect;
18+
19+
/**
20+
* Utility class for computing a histogram of the bytes seen in a stream.
21+
*
22+
* @since Apache Tika 1.2
23+
*/
24+
public class TextStatistics {

    /** Number of occurrences of each byte value, indexed by unsigned value. */
    private final int[] counts = new int[256];

    /** Total number of bytes added so far. */
    private int total = 0;

    /**
     * Adds the given bytes to the histogram.
     *
     * @param buffer array containing the bytes to add
     * @param offset index of the first byte to add
     * @param length number of bytes to add, starting at the offset
     */
    public void addData(byte[] buffer, int offset, int length) {
        for (int i = 0; i < length; i++) {
            // Mask to an unsigned value so that bytes >= 0x80 index correctly
            counts[buffer[offset + i] & 0xff]++;
            total++;
        }
    }

    /**
     * Checks whether at least one byte was seen and that the bytes that
     * were seen were mostly plain text (i.e. less than 2% control,
     * more than 90% ASCII range).
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-688">TIKA-688</a>
     * @return <code>true</code> if the seen bytes were mostly safe ASCII,
     *         <code>false</code> otherwise
     */
    public boolean isMostlyAscii() {
        // Use long arithmetic: multiplying large int counts by 100 would
        // silently overflow and could flip the result of the comparisons.
        long control = count(0, 0x20);
        long ascii = count(0x20, 128);
        long safe = countSafeControl();
        return total > 0
                && (control - safe) * 100 < total * 2L
                && (ascii + safe) * 100 > total * 90L;
    }

    /**
     * Returns the total number of bytes seen so far.
     *
     * @return count of all bytes
     */
    public int count() {
        return total;
    }

    /**
     * Returns the number of occurrences of the given byte.
     *
     * @param b byte; only the lowest eight bits are used
     * @return count of the given byte
     */
    public int count(int b) {
        return counts[b & 0xff];
    }

    /**
     * Counts control characters (i.e. &lt; 0x20, excluding tab, CR, LF,
     * page feed and escape).
     * <p>
     * This definition of control characters is based on section 4 of the
     * "Content-Type Processing Model" Internet-draft
     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
     * >draft-abarth-mime-sniff-01</a>).
     * <pre>
     * +-------------------------+
     * | Binary data byte ranges |
     * +-------------------------+
     * | 0x00 -- 0x08            |
     * | 0x0B                    |
     * | 0x0E -- 0x1A            |
     * | 0x1C -- 0x1F            |
     * +-------------------------+
     * </pre>
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
     * @return count of control characters
     */
    public int countControl() {
        return count(0, 0x20) - countSafeControl();
    }

    /**
     * Counts "safe" ASCII characters, i.e. the bytes in the range
     * 0x20 to 0x7F plus the control characters that are treated as safe
     * (tab, LF, CR, page feed and escape).
     *
     * @see #countControl()
     * @return count of safe ASCII characters
     */
    public int countSafeAscii() {
        return count(0x20, 128) + countSafeControl();
    }

    /**
     * Counts eight bit characters, i.e. bytes with their highest bit set.
     *
     * @return count of eight bit characters
     */
    public int countEightBit() {
        return count(128, 256);
    }

    /**
     * Sums the counts of all byte values in the given half-open range.
     *
     * @param from first byte value to include
     * @param to byte value at which to stop (exclusive), at most 256
     * @return combined count of the byte values in the range
     */
    private int count(int from, int to) {
        // The upper bound is exclusive, so it may equal the array length
        // (as it does for the eight bit range 128..256).
        assert 0 <= from && to <= counts.length;
        int count = 0;
        for (int i = from; i < to; i++) {
            count += counts[i];
        }
        return count;
    }

    /**
     * Counts the control characters that commonly occur in plain text.
     *
     * @return combined count of tab, LF, CR, page feed and escape bytes
     */
    private int countSafeControl() {
        return count('\t') + count('\n') + count('\r') // tab, LF, CR
                + count(0x0c) + count(0x1b); // new page, escape
    }

}

tika-core/src/main/java/org/apache/tika/mime/MediaType.java

+25
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package org.apache.tika.mime;
1818

1919
import java.io.Serializable;
20+
import java.nio.charset.Charset;
2021
import java.util.Collections;
2122
import java.util.HashMap;
2223
import java.util.HashSet;
@@ -72,6 +73,8 @@ public final class MediaType implements Comparable<MediaType>, Serializable {
7273

7374
public static final MediaType TEXT_PLAIN = parse("text/plain");
7475

76+
public static final MediaType TEXT_HTML = parse("text/html");
77+
7578
public static final MediaType APPLICATION_XML = parse("application/xml");
7679

7780
public static final MediaType APPLICATION_ZIP = parse("application/zip");
@@ -345,6 +348,28 @@ public MediaType(MediaType type, Map<String, String> parameters) {
345348
union(type.parameters, parameters));
346349
}
347350

351+
/**
352+
* Creates a media type by adding a parameter to a base type.
353+
*
354+
* @param type base type
355+
* @param name parameter name
356+
* @param value parameter value
357+
* @since Apache Tika 1.2
358+
*/
359+
public MediaType(MediaType type, String name, String value) {
360+
this(type, Collections.singletonMap(name, value));
361+
}
362+
363+
/**
364+
* Creates a media type by adding the "charset" parameter to a base type.
365+
*
366+
* @param type base type
367+
* @param charset charset value
368+
* @since Apache Tika 1.2
369+
*/
370+
public MediaType(MediaType type, Charset charset) {
371+
this(type, "charset", charset.name());
372+
}
348373
/**
349374
* Returns the base form of the MediaType, excluding
350375
* any parameters, such as "text/plain" for

tika-core/src/main/java/org/apache/tika/mime/package-info.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,5 @@
1818
/**
1919
* Media type information.
2020
*/
21-
@aQute.bnd.annotation.Version("1.0.0")
21+
@aQute.bnd.annotation.Version("1.2.0")
2222
package org.apache.tika.mime;

tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,16 @@ public void testDetectEmpty() throws Exception {
5151
public void testDetectText() throws Exception {
5252
assertText("Hello, World!".getBytes("UTF-8"));
5353
assertText(" \t\r\n".getBytes("UTF-8"));
54-
assertText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
54+
assertNotText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
5555
assertNotText(new byte[] { 0 });
5656
assertNotText(new byte[] { 'H', 'e', 'l', 'l', 'o', 0 });
5757

5858
byte[] data = new byte[512];
5959
Arrays.fill(data, (byte) '.');
6060
assertText(data);
61-
Arrays.fill(data, 100, 109, (byte) 0x1f);
62-
assertText(data); // almost text
6361
Arrays.fill(data, 100, 110, (byte) 0x1f);
62+
assertText(data); // almost text
63+
Arrays.fill(data, 100, 111, (byte) 0x1f);
6464
assertNotText(data); // no longer almost text, too many control chars
6565
Arrays.fill(data, (byte) 0x1f);
6666
assertNotText(data);

tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,13 @@ public void testDetection() throws Exception {
6767

6868
public void testByteOrderMark() throws Exception {
6969
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
70-
new ByteArrayInputStream("\ufffetest".getBytes("UTF-16LE")),
70+
new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
7171
new Metadata()));
7272
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
73-
new ByteArrayInputStream("\ufffetest".getBytes("UTF-16BE")),
73+
new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
7474
new Metadata()));
7575
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
76-
new ByteArrayInputStream("\ufffetest".getBytes("UTF-8")),
76+
new ByteArrayInputStream("\ufefftest".getBytes("UTF-8")),
7777
new Metadata()));
7878
}
7979

tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

+9-5
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import java.io.IOException;
2020
import java.io.InputStream;
21+
import java.nio.charset.Charset;
2122
import java.util.Arrays;
2223
import java.util.Collections;
2324
import java.util.HashSet;
@@ -57,7 +58,7 @@ public class HtmlParser extends AbstractParser {
5758
new ServiceLoader(HtmlParser.class.getClassLoader());
5859

5960
/**
60-
* HTML schema singleton used to amortize the heavy instantiation time.
61+
* HTML schema singleton used to amortise the heavy instantiation time.
6162
*/
6263
private static final Schema HTML_SCHEMA = new HTMLSchema();
6364

@@ -73,11 +74,14 @@ public void parse(
7374
AutoDetectReader reader = new AutoDetectReader(
7475
new CloseShieldInputStream(stream), metadata, LOADER);
7576
try {
76-
if (metadata.get(Metadata.CONTENT_TYPE) == null) {
77-
// TODO: Include charset
78-
metadata.set(Metadata.CONTENT_TYPE, "text/html");
77+
Charset charset = reader.getCharset();
78+
String previous = metadata.get(Metadata.CONTENT_TYPE);
79+
if (previous == null || previous.startsWith("text/html")) {
80+
MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
81+
metadata.set(Metadata.CONTENT_TYPE, type.toString());
7982
}
80-
metadata.set(Metadata.CONTENT_ENCODING, reader.getCharset().name());
83+
// deprecated, see TIKA-431
84+
metadata.set(Metadata.CONTENT_ENCODING, charset.name());
8185

8286
// Get the HTML mapper from the parse context
8387
HtmlMapper mapper =

tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java

+5-11
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,7 @@
2222
import java.io.InputStream;
2323
import java.nio.channels.FileChannel;
2424
import java.util.Collections;
25-
import java.util.HashMap;
2625
import java.util.HashSet;
27-
import java.util.Map;
2826
import java.util.Set;
2927
import java.util.regex.Pattern;
3028

@@ -76,10 +74,12 @@ public class POIFSContainerDetector implements Detector {
7674
public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");
7775

7876
/** An OLE10 Native embedded document within another OLE2 document */
79-
public static final MediaType OLE10_NATIVE = new MediaType(GENERAL_EMBEDDED, format("ole10_native"));
77+
public static final MediaType OLE10_NATIVE =
78+
new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");
8079

8180
/** Some other kind of embedded document, in a CompObj container within another OLE2 document */
82-
public static final MediaType COMP_OBJ = new MediaType(GENERAL_EMBEDDED, format("comp_obj"));
81+
public static final MediaType COMP_OBJ =
82+
new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");
8383

8484
/** Microsoft Excel */
8585
public static final MediaType XLS = application("vnd.ms-excel");
@@ -122,13 +122,7 @@ public class POIFSContainerDetector implements Detector {
122122

123123
/** Regexp for matching the MPP Project Data stream */
124124
private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
125-
126-
private static Map<String,String> format(String format) {
127-
Map<String, String> params = new HashMap<String, String>();
128-
params.put("format", format);
129-
return params;
130-
}
131-
125+
132126
public MediaType detect(InputStream input, Metadata metadata)
133127
throws IOException {
134128
// Check if we have access to the document

0 commit comments

Comments
 (0)