Skip to content

Commit 95a1cf9

Browse files
committed
TIKA-906: Added basic support for AutoPageNumbers and their formats
git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1358856 13f79535-47bb-0310-9956-ffa450edef68
1 parent c6bcd32 commit 95a1cf9

8 files changed

+281
-3
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.parser.iwork;
18+
19+
/**
 * Utility class to allow for conversion from an integer to Roman numerals
 * or alpha-numeric symbols in line with Pages auto numbering formats.
 */
class AutoPageNumberUtils {

    /** Number of letters in the alphabet used by the alphabetic format. */
    private static final int MAX = 26;

    /** Static helpers only; not meant to be instantiated. */
    private AutoPageNumberUtils() {
    }

    /**
     * Converts a page number to its upper-case alphabetic form: a single
     * letter repeated once per completed pass through the alphabet
     * (1 = A, 26 = Z, 27 = AA, 52 = ZZ, 53 = AAA, ...).
     *
     * @param i the page number, expected to be 1 or greater
     * @return the alphabetic form, or an empty string when {@code i} is
     *         zero or negative
     */
    public static String asAlphaNumeric(int i) {
        // Zero already produced an empty string; negative values used to
        // either do the same or fail on an array index, depending on the
        // value. Treat every non-positive number uniformly.
        if (i <= 0) {
            return "";
        }

        int zeroBased = i - 1;
        char letter = (char) ('A' + zeroBased % MAX);
        int repeats = zeroBased / MAX + 1;

        StringBuilder sbuff = new StringBuilder(repeats);
        for (int j = 0; j < repeats; j++) {
            sbuff.append(letter);
        }
        return sbuff.toString();
    }

    /**
     * Lower-case variant of {@link #asAlphaNumeric(int)}.
     *
     * @param i the page number, expected to be 1 or greater
     * @return the lower-case alphabetic form, or an empty string when
     *         {@code i} is zero or negative
     */
    public static String asAlphaNumericLower(int i) {
        // Fixed locale: with a Turkish default locale "I" would otherwise
        // be lower-cased to a dotless i.
        return asAlphaNumeric(i).toLowerCase(java.util.Locale.ENGLISH);
    }

    /*
     * Code copied from jena.apache.org.
     * @see com.hp.hpl.jena.sparql.util.RomanNumeral
     */
    /**
     * Converts a page number to upper-case Roman numerals.
     *
     * @param i the page number, between 1 and 3999 inclusive
     * @return the Roman numeral form of {@code i}
     * @throws NumberFormatException if {@code i} is outside 1-3999
     */
    public static String asRomanNumerals(int i) {
        if (i <= 0 || i > 3999) {
            throw new NumberFormatException("Roman numerals are 1-3999 (" + i + ")");
        }
        StringBuilder sbuff = new StringBuilder();

        // Peel off thousands, then hundreds, then tens; each call returns
        // what is left for the next, smaller group of symbols.
        i = i2r(sbuff, i, "M", 1000, "CM", 900, "D", 500, "CD", 400);
        i = i2r(sbuff, i, "C", 100, "XC", 90, "L", 50, "XL", 40);
        i = i2r(sbuff, i, "X", 10, "IX", 9, "V", 5, "IV", 4);

        // Whatever remains (0-3) is written as plain units.
        while (i >= 1) {
            sbuff.append("I");
            i -= 1;
        }
        return sbuff.toString();
    }

    /**
     * Lower-case variant of {@link #asRomanNumerals(int)}.
     *
     * @param i the page number, between 1 and 3999 inclusive
     * @return the lower-case Roman numeral form of {@code i}
     * @throws NumberFormatException if {@code i} is outside 1-3999
     */
    public static String asRomanNumeralsLower(int i) {
        // Fixed locale: with a Turkish default locale "I" would otherwise
        // be lower-cased to a dotless i.
        return asRomanNumerals(i).toLowerCase(java.util.Locale.ENGLISH);
    }

    /**
     * Appends the symbols for one decimal group of a Roman numeral and
     * returns the part of the value that is still to be converted.
     *
     * @param sbuff buffer the symbols are appended to
     * @param i value still to be converted
     * @param tens symbol appended repeatedly while {@code i >= iTens}
     * @param iTens value of {@code tens}
     * @param nines symbol appended at most once when {@code i >= iNines}
     * @param iNines value of {@code nines}
     * @param fives symbol appended at most once when {@code i >= iFives}
     * @param iFives value of {@code fives}
     * @param fours symbol appended at most once when {@code i >= iFours}
     * @param iFours value of {@code fours}
     * @return the remainder of {@code i} after the appended symbols
     */
    private static int i2r(StringBuilder sbuff, int i,
                           String tens, int iTens,
                           String nines, int iNines,
                           String fives, int iFives,
                           String fours, int iFours) {
        while (i >= iTens) {
            sbuff.append(tens);
            i -= iTens;
        }

        if (i >= iNines) {
            sbuff.append(nines);
            i -= iNines;
        }

        if (i >= iFives) {
            sbuff.append(fives);
            i -= iFives;
        }

        if (i >= iFours) {
            sbuff.append(fours);
            i -= iFours;
        }
        return i;
    }

}

tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java

+32
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import java.util.HashMap;
2929
import java.util.List;
3030
import java.util.Map;
31+
import java.util.regex.Pattern;
3132

3233
class PagesContentHandler extends DefaultHandler {
3334

@@ -43,6 +44,8 @@ private enum DocumentPart {
4344
}
4445
private DocumentPart inPart = null;
4546
private boolean ghostText;
47+
48+
private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
4649

4750
private boolean parseProperty = false;
4851
private int pageCount = 0;
@@ -132,6 +135,19 @@ public void startElement(
132135
inPart = headers.identifyPart(attributes.getValue("sf:name"));
133136
} else if ("sf:footer".equals(qName)) {
134137
inPart = footers.identifyPart(attributes.getValue("sf:name"));
138+
} else if ("sf:page-number".equals(qName)) {
139+
if (inPart == DocumentPart.FOOTER_ODD
140+
|| inPart == DocumentPart.FOOTER_FIRST
141+
|| inPart == DocumentPart.FOOTER_EVEN) {
142+
// We are in a footer
143+
footers.hasAutoPageNumber = true;
144+
footers.autoPageNumberFormat = attributes.getValue("sf:format");
145+
} else {
146+
headers.hasAutoPageNumber = true;
147+
headers.autoPageNumberFormat = attributes.getValue("sf:format");
148+
}
149+
150+
xhtml.characters(Integer.toString(this.pageCount));
135151
} else if ("sf:footnotes".equals(qName)) {
136152
footnotes = new Footnotes();
137153
inPart = DocumentPart.FOOTNOTES;
@@ -324,6 +340,8 @@ private class HeaderFooter {
324340
private String defaultOdd;
325341
private String defaultEven;
326342
private String defaultFirst;
343+
private boolean hasAutoPageNumber;
344+
private String autoPageNumberFormat;
327345
// TODO Can there be custom ones?
328346

329347
private HeaderFooter(String type) {
@@ -359,6 +377,19 @@ private void output(String what) throws SAXException {
359377
if (text != null) {
360378
xhtml.startElement("div", "class", "header");
361379
xhtml.characters(text);
380+
if (hasAutoPageNumber) {
381+
if (autoPageNumberFormat == null) { // raw number
382+
xhtml.characters("\t" + pageCount);
383+
} else if (autoPageNumberFormat.equals("upper-roman")){
384+
xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount));
385+
} else if (autoPageNumberFormat.equals("lower-roman")){
386+
xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount));
387+
} else if (autoPageNumberFormat.equals("upper-alpha")){
388+
xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount));
389+
} else if (autoPageNumberFormat.equals("lower-alpha")){
390+
xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount));
391+
}
392+
}
362393
xhtml.endElement("div");
363394
}
364395
}
@@ -414,4 +445,5 @@ private void end() {
414445
}
415446
}
416447
}
448+
417449
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package org.apache.tika.parser.iwork;
2+
3+
import junit.framework.TestCase;
4+
5+
/**
6+
* Test class for the <code>AutoPageNumberUtils</code> helper class.
7+
*/
8+
public class AutoPageNumberUtilsTest extends TestCase {
9+
10+
/**
11+
* Check upper-case alpha-numeric numbers are generated based on the
12+
* input page number.
13+
*/
14+
public void testAlphaUpper() {
15+
assertEquals("A", AutoPageNumberUtils.asAlphaNumeric(1));
16+
assertEquals("Z", AutoPageNumberUtils.asAlphaNumeric(26));
17+
assertEquals("AA", AutoPageNumberUtils.asAlphaNumeric(27));
18+
assertEquals("ZZ", AutoPageNumberUtils.asAlphaNumeric(52));
19+
assertEquals("AAA", AutoPageNumberUtils.asAlphaNumeric(53));
20+
assertEquals("ZZZ", AutoPageNumberUtils.asAlphaNumeric(78));
21+
}
22+
23+
/**
24+
* Check lower-case alpha-numeric numbers are generated based on the
25+
* input page number.
26+
*/
27+
public void testAlphaLower() {
28+
assertEquals("a", AutoPageNumberUtils.asAlphaNumericLower(1));
29+
assertEquals("z", AutoPageNumberUtils.asAlphaNumericLower(26));
30+
assertEquals("aa", AutoPageNumberUtils.asAlphaNumericLower(27));
31+
assertEquals("zz", AutoPageNumberUtils.asAlphaNumericLower(52));
32+
assertEquals("aaa", AutoPageNumberUtils.asAlphaNumericLower(53));
33+
assertEquals("zzz", AutoPageNumberUtils.asAlphaNumericLower(78));
34+
}
35+
36+
/**
37+
* Check upper-case Roman numerals numbers are generated based on the
38+
* input page number.
39+
*/
40+
public void testRomanUpper() {
41+
assertEquals("I", AutoPageNumberUtils.asRomanNumerals(1));
42+
assertEquals("XXVI", AutoPageNumberUtils.asRomanNumerals(26));
43+
assertEquals("XXVII", AutoPageNumberUtils.asRomanNumerals(27));
44+
}
45+
46+
/**
47+
* Check lower-case Roman numerals numbers are generated based on the
48+
* input page number.
49+
*/
50+
public void testRomanLower() {
51+
assertEquals("i", AutoPageNumberUtils.asRomanNumeralsLower(1));
52+
assertEquals("xxvi", AutoPageNumberUtils.asRomanNumeralsLower(26));
53+
assertEquals("xxvii", AutoPageNumberUtils.asRomanNumeralsLower(27));
54+
}
55+
56+
}

tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java

+83-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import junit.framework.TestCase;
2424

2525
import org.apache.tika.metadata.Metadata;
26-
import org.apache.tika.metadata.Office;
2726
import org.apache.tika.metadata.TikaCoreProperties;
2827
import org.apache.tika.parser.AutoDetectParser;
2928
import org.apache.tika.parser.ParseContext;
@@ -291,7 +290,8 @@ public void testParsePagesPasswordProtected() throws Exception {
291290
public void testParsePagesHeadersFootersFootnotes() throws Exception {
292291
String footnote = "Footnote: Do a lot of people really use iWork?!?!";
293292
String header = "THIS IS SOME HEADER TEXT";
294-
String footer = "THIS IS SOME FOOTER TEXT";
293+
String footer = "THIS IS SOME FOOTER TEXT\t1";
294+
String footer2 = "THIS IS SOME FOOTER TEXT\t2";
295295

296296
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersFootnotes.pages");
297297
Metadata metadata = new Metadata();
@@ -308,17 +308,97 @@ public void testParsePagesHeadersFootersFootnotes() throws Exception {
308308
// Check for headers, footers and footnotes
309309
assertContains(contents, header);
310310
assertContains(contents, footer);
311+
assertContains(contents, footer2);
311312
assertContains(contents, footnote);
312313
}
313314

315+
/**
316+
* Check we get upper-case Roman numerals within the footer for AutoPageNumber.
317+
*/
318+
public void testParsePagesHeadersFootersRomanUpper() throws Exception {
319+
String header = "THIS IS SOME HEADER TEXT";
320+
String footer = "THIS IS SOME FOOTER TEXT\tI";
321+
String footer2 = "THIS IS SOME FOOTER TEXT\tII";
322+
323+
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanUpper.pages");
324+
ContentHandler handler = new BodyContentHandler();
325+
326+
iWorkParser.parse(input, handler, new Metadata(), parseContext);
327+
String contents = handler.toString();
328+
329+
// Check for headers, footers and footnotes
330+
assertContains(contents, header);
331+
assertContains(contents, footer);
332+
assertContains(contents, footer2);
333+
}
334+
335+
/**
336+
* Check we get lower-case Roman numerals within the footer for AutoPageNumber.
337+
*/
338+
public void testParsePagesHeadersFootersRomanLower() throws Exception {
339+
String header = "THIS IS SOME HEADER TEXT";
340+
String footer = "THIS IS SOME FOOTER TEXT\ti";
341+
String footer2 = "THIS IS SOME FOOTER TEXT\tii";
342+
343+
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanLower.pages");
344+
ContentHandler handler = new BodyContentHandler();
345+
346+
iWorkParser.parse(input, handler, new Metadata(), parseContext);
347+
String contents = handler.toString();
348+
349+
// Check for headers, footers and footnotes
350+
assertContains(contents, header);
351+
assertContains(contents, footer);
352+
assertContains(contents, footer2);
353+
}
354+
355+
/**
356+
* Check we get upper-case alpha-numeric letters within the footer for AutoPageNumber.
357+
*/
358+
public void testParsePagesHeadersAlphaUpper() throws Exception {
359+
String header = "THIS IS SOME HEADER TEXT\tA";
360+
String footer = "THIS IS SOME FOOTER TEXT\tA";
361+
String footer2 = "THIS IS SOME FOOTER TEXT\tB";
362+
363+
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaUpper.pages");
364+
ContentHandler handler = new BodyContentHandler();
365+
366+
iWorkParser.parse(input, handler, new Metadata(), parseContext);
367+
String contents = handler.toString();
368+
369+
// Check for headers, footers and footnotes
370+
assertContains(contents, header);
371+
assertContains(contents, footer);
372+
assertContains(contents, footer2);
373+
}
374+
375+
/**
376+
* Check we get lower-case alpha-numeric letters within the footer for AutoPageNumber.
377+
*/
378+
public void testParsePagesHeadersAlphaLower() throws Exception {
379+
String header = "THIS IS SOME HEADER TEXT";
380+
String footer = "THIS IS SOME FOOTER TEXT\ta";
381+
String footer2 = "THIS IS SOME FOOTER TEXT\tb";
382+
383+
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaLower.pages");
384+
ContentHandler handler = new BodyContentHandler();
385+
386+
iWorkParser.parse(input, handler, new Metadata(), parseContext);
387+
String contents = handler.toString();
388+
389+
// Check for headers, footers and footnotes
390+
assertContains(contents, header);
391+
assertContains(contents, footer);
392+
assertContains(contents, footer2);
393+
}
394+
314395
/**
315396
* Check we get annotations (eg comments) from Pages
316397
*/
317398
public void testParsePagesAnnotations() throws Exception {
318399
String commentA = "comment about the APXL file";
319400
String commentB = "comment about UIMA";
320401

321-
322402
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages");
323403
Metadata metadata = new Metadata();
324404
ContentHandler handler = new BodyContentHandler();

0 commit comments

Comments
 (0)