diff --git a/openpdf/src/main/java/com/lowagie/text/pdf/parser/PdfContentStreamHandler.java b/openpdf/src/main/java/com/lowagie/text/pdf/parser/PdfContentStreamHandler.java
index 2c443c501..bdd57e2c9 100644
--- a/openpdf/src/main/java/com/lowagie/text/pdf/parser/PdfContentStreamHandler.java
+++ b/openpdf/src/main/java/com/lowagie/text/pdf/parser/PdfContentStreamHandler.java
@@ -41,25 +41,34 @@
  */
 package com.lowagie.text.pdf.parser;
 
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.ListIterator;
+import java.util.Map;
+import java.util.Stack;
+
+import com.lowagie.text.ExceptionConverter;
 import com.lowagie.text.error_messages.MessageLocalization;
 import com.lowagie.text.pdf.CMapAwareDocumentFont;
 import com.lowagie.text.pdf.PRIndirectReference;
+import com.lowagie.text.pdf.PRStream;
+import com.lowagie.text.pdf.PRTokeniser;
 import com.lowagie.text.pdf.PdfArray;
+import com.lowagie.text.pdf.PdfContentParser;
 import com.lowagie.text.pdf.PdfDictionary;
 import com.lowagie.text.pdf.PdfIndirectReference;
 import com.lowagie.text.pdf.PdfLiteral;
 import com.lowagie.text.pdf.PdfName;
 import com.lowagie.text.pdf.PdfNumber;
 import com.lowagie.text.pdf.PdfObject;
+import com.lowagie.text.pdf.PdfReader;
+import com.lowagie.text.pdf.PdfStream;
 import com.lowagie.text.pdf.PdfString;
 
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Stack;
-
 /**
  * @author dgd
  */
@@ -780,6 +789,116 @@ public void invoke(ArrayList<PdfObject> operands,
             handler.popContext();
         }
     }
+
+    /**
+     * A content operator implementation (Do). Processes the content stream of
+     * a form XObject with the same handler so that its operators are handled too.
+     * XObjects of any other subtype are ignored.
+     */
+    private class Do implements ContentOperator {
+
+        /**
+         * @see com.lowagie.text.pdf.parser.ContentOperator#getOperatorName()
+         */
+        @Override
+        public String getOperatorName() {
+            return "Do";
+        }
+
+        /**
+         * @see com.lowagie.text.pdf.parser.ContentOperator#invoke(java.util.ArrayList,
+         *      com.lowagie.text.pdf.parser.PdfContentStreamHandler,
+         *      com.lowagie.text.pdf.PdfDictionary)
+         */
+        @Override
+        public void invoke(ArrayList<PdfObject> operands, PdfContentStreamHandler handler,
+                PdfDictionary resources) {
+            PdfObject firstOperand = operands.get(0);
+            if (!(firstOperand instanceof PdfName) || resources == null) {
+                return;
+            }
+            PdfDictionary xObjects = resources.getAsDict(PdfName.XOBJECT);
+            if (xObjects == null) {
+                return;
+            }
+            // The name may be missing from the dictionary or may not refer to a
+            // stream at all; skip it instead of failing on a null or a bad cast.
+            PdfObject xObject = xObjects.getDirectObject((PdfName) firstOperand);
+            if (!(xObject instanceof PdfStream)) {
+                return;
+            }
+            PdfStream form = (PdfStream) xObject;
+            if (!PdfName.FORM.equals(form.getAsName(PdfName.SUBTYPE))) {
+                return;
+            }
+            // A form XObject without its own /Resources falls back to the
+            // resources of the content stream that invokes it.
+            PdfDictionary formResources = form.getAsDict(PdfName.RESOURCES);
+            if (formResources == null) {
+                formResources = resources;
+            }
+            byte[] data;
+            try {
+                data = getContentBytesFromPdfObject(form);
+            } catch (IOException ex) {
+                throw new ExceptionConverter(ex);
+            }
+            // Save and restore the graphics state around the form content.
+            new PushGraphicsState().invoke(operands, handler, resources);
+            processContent(data, formResources, handler);
+            new PopGraphicsState().invoke(operands, handler, resources);
+        }
+
+        /**
+         * Parses a content stream and dispatches each operator to the handler.
+         *
+         * @param contentBytes the content stream bytes to parse
+         * @param resources the resources passed along with every operator
+         * @param handler the handler that receives the operators
+         */
+        private void processContent(byte[] contentBytes, PdfDictionary resources,
+                PdfContentStreamHandler handler) {
+            try {
+                PdfContentParser ps = new PdfContentParser(new PRTokeniser(contentBytes));
+                ArrayList<PdfObject> operands = new ArrayList<PdfObject>();
+                while (ps.parse(operands).size() > 0) {
+                    // The operator is taken from the last element of the parsed list.
+                    PdfLiteral operator = (PdfLiteral) operands.get(operands.size() - 1);
+                    handler.invokeOperator(operator, operands, resources);
+                }
+            } catch (Exception e) {
+                throw new ExceptionConverter(e);
+            }
+        }
+
+        /**
+         * Reads the bytes of a content object.
+         *
+         * @param object a stream, an array of content objects, or an indirect
+         *        reference to either
+         * @return the content bytes; array elements are concatenated in order
+         * @throws IOException if reading the stream bytes fails
+         * @throws IllegalStateException if the object is of any other type
+         */
+        private byte[] getContentBytesFromPdfObject(PdfObject object) throws IOException {
+            switch (object.type()) {
+                case PdfObject.INDIRECT:
+                    return getContentBytesFromPdfObject(PdfReader.getPdfObject(object));
+                case PdfObject.STREAM:
+                    return PdfReader.getStreamBytes((PRStream) PdfReader.getPdfObject(object));
+                case PdfObject.ARRAY:
+                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+                    ListIterator<PdfObject> iter = ((PdfArray) object).listIterator();
+                    while (iter.hasNext()) {
+                        PdfObject element = iter.next();
+                        baos.write(getContentBytesFromPdfObject(element));
+                    }
+                    return baos.toByteArray();
+                default:
+                    throw new IllegalStateException("Unsupported type: " + object.getClass().getCanonicalName());
+            }
+        }
+    }
 
     /**
      * Loads all the supported graphics and text state operators in a map.
@@ -827,6 +946,8 @@ protected void installDefaultOperators() {
         registerContentOperator(new BeginMarked());
         registerContentOperator(new BeginMarkedDict());
         registerContentOperator(new EndMarked());
+
+        registerContentOperator(new Do());
     }
 
     /**
diff --git a/openpdf/src/test/java/com/lowagie/text/pdf/parser/PdfTextExtractorTest.java b/openpdf/src/test/java/com/lowagie/text/pdf/parser/PdfTextExtractorTest.java
new file mode 100644
index 000000000..26a159598
--- /dev/null
+++ b/openpdf/src/test/java/com/lowagie/text/pdf/parser/PdfTextExtractorTest.java
@@ -0,0 +1,68 @@
+package com.lowagie.text.pdf.parser;
+
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.lowagie.text.pdf.PdfReader;
+
+
+public class PdfTextExtractorTest {
+
+
+    @Test(expected=IOException.class)
+    public void testPageExceeded() throws Exception {
+        getString("HelloWorldMeta.pdf", 5);
+    }
+    @Test(expected=IOException.class)
+    public void testInvalidPageNumber() throws Exception {
+        getString("HelloWorldMeta.pdf", 0);
+    }
+
+
+    @Test
+    public void testConcatenateWatermark() throws Exception {
+        String result = getString("merge-acroforms.pdf", 5);
+        Assert.assertNotNull(result);
+        // Strip anything that looks like a markup tag from the extracted text.
+        result = result.replaceAll("\\<.*?>","");
+        // Collapse runs of whitespace so the checks do not depend on how many
+        result = result.replaceAll("\\s+", " ");
+        Assert.assertTrue(result.contains("2. This is chapter 2"));
+        Assert.assertTrue(result.contains("watermark-concatenate"));
+    }
+
+
+    private String getString(String fileName, int pageNumber) throws Exception {
+        return getString(new File("src/test/resources", fileName), pageNumber);
+    }
+    private String getString(File file, int pageNumber) throws Exception {
+        byte[] pdfBytes = readDocument(file);
+        final PdfReader pdfReader = new PdfReader(pdfBytes);
+
+        return new PdfTextExtractor(pdfReader).getTextFromPage(pageNumber);
+    }
+
+    protected static byte[] readDocument(final File file) throws IOException {
+
+        try (ByteArrayOutputStream fileBytes = new ByteArrayOutputStream();
+                InputStream inputStream = new FileInputStream(file)) {
+            final byte[] buffer = new byte[8192];
+            while (true) {
+                final int bytesRead = inputStream.read(buffer);
+                if (bytesRead == -1) {
+                    break;
+                }
+                fileBytes.write(buffer, 0, bytesRead);
+            }
+            return fileBytes.toByteArray();
+        }
+
+    }
+}
diff --git a/openpdf/src/test/resources/merge-acroforms.pdf b/openpdf/src/test/resources/merge-acroforms.pdf
new file mode 100644
index 000000000..4d3d09814
Binary files /dev/null and b/openpdf/src/test/resources/merge-acroforms.pdf differ