Skip to content

Commit

Permalink
Merge pull request LibrePDF#64 from albfernandez/extract_text_from_forms
Browse files Browse the repository at this point in the history
Extract text from forms
  • Loading branch information
asturio authored Feb 4, 2018
2 parents d6a792b + 8766d17 commit d39a65f
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,25 +41,34 @@
*/
package com.lowagie.text.pdf.parser;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.ListIterator;
import java.util.Map;
import java.util.Stack;

import com.lowagie.text.ExceptionConverter;
import com.lowagie.text.error_messages.MessageLocalization;
import com.lowagie.text.pdf.CMapAwareDocumentFont;
import com.lowagie.text.pdf.PRIndirectReference;
import com.lowagie.text.pdf.PRStream;
import com.lowagie.text.pdf.PRTokeniser;
import com.lowagie.text.pdf.PdfArray;
import com.lowagie.text.pdf.PdfContentParser;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PdfIndirectReference;
import com.lowagie.text.pdf.PdfLiteral;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfNumber;
import com.lowagie.text.pdf.PdfObject;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfStream;
import com.lowagie.text.pdf.PdfString;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Stack;

/**
* @author dgd
*/
Expand Down Expand Up @@ -631,7 +640,7 @@ public void invoke(ArrayList<PdfObject> operands,
public Matrix textLineMatrix;

boolean useContainerMarkup;

/**
* detail parser for text within a marked section. used by TextAssembler
*/
Expand Down Expand Up @@ -780,6 +789,75 @@ public void invoke(ArrayList<PdfObject> operands,
handler.popContext();
}
}

/**
 * Content operator for <code>Do</code> (invoke a named XObject).
 * Form XObjects are parsed recursively so that the operators they contain are
 * dispatched through the same operator table as the invoking content stream;
 * XObjects of any other subtype are ignored.
 */
private class Do implements ContentOperator {
    /**
     * @see com.lowagie.text.pdf.parser.ContentOperator#getOperatorName()
     */
    @Override
    public String getOperatorName() {
        return "Do";
    }

    /**
     * Looks up the XObject named by the first operand in the <code>/XObject</code>
     * entry of <code>resources</code> and, if it is a form XObject, processes its
     * content between a graphics-state push and pop.
     * Nothing happens when the first operand is not a name, when there is no
     * <code>/XObject</code> dictionary, when the name does not resolve to a stream,
     * or when the stream's subtype is not <code>/Form</code>.
     *
     * @param operands  operands of the operator; the first one is expected to be the XObject name
     * @param handler   handler passed on to the graphics-state push/pop operators
     * @param resources resource dictionary of the content stream that issued the operator
     * @throws ExceptionConverter if the bytes of the form XObject cannot be read
     */
    @Override
    public void invoke(ArrayList<PdfObject> operands, PdfContentStreamHandler handler, PdfDictionary resources) {
        if (operands.isEmpty()) {
            return;
        }
        PdfObject firstOperand = operands.get(0);
        if (!(firstOperand instanceof PdfName)) {
            return;
        }
        PdfDictionary xObjects = resources.getAsDict(PdfName.XOBJECT);
        if (xObjects == null) {
            return;
        }
        // The name may be missing from the dictionary or may resolve to something
        // that is not a stream; skip it instead of failing with a
        // NullPointerException / ClassCastException.
        PdfObject xObject = xObjects.getDirectObject((PdfName) firstOperand);
        if (!(xObject instanceof PdfStream)) {
            return;
        }
        PdfStream stream = (PdfStream) xObject;
        if (!PdfName.FORM.equals(stream.getAsName(PdfName.SUBTYPE))) {
            return;
        }
        // A form XObject without its own /Resources entry uses the resources of
        // the content stream that invoked it.
        PdfDictionary formResources = stream.getAsDict(PdfName.RESOURCES);
        if (formResources == null) {
            formResources = resources;
        }
        byte[] data;
        try {
            data = getContentBytesFromPdfObject(stream);
        } catch (IOException ex) {
            throw new ExceptionConverter(ex);
        }
        // Bracket the nested content with a graphics-state push/pop so that state
        // changes made inside the form do not affect the invoking content.
        new PushGraphicsState().invoke(operands, handler, resources);
        processContent(data, formResources);
        new PopGraphicsState().invoke(operands, handler, resources);
    }

    /**
     * Parses <code>contentBytes</code> as a content stream and dispatches every
     * operator found, together with its operands, to <code>invokeOperator</code>.
     *
     * @param contentBytes content stream bytes to parse
     * @param resources    resource dictionary handed to each invoked operator
     * @throws ExceptionConverter wrapping any exception raised while parsing or dispatching
     */
    private void processContent(byte[] contentBytes, PdfDictionary resources) {
        try {
            PdfContentParser ps = new PdfContentParser(new PRTokeniser(contentBytes));
            ArrayList<PdfObject> operands = new ArrayList<PdfObject>();
            while (ps.parse(operands).size() > 0) {
                // The operator is the last element of the parsed list; the
                // preceding elements are its operands.
                PdfLiteral operator = (PdfLiteral) operands.get(operands.size() - 1);
                invokeOperator(operator, operands, resources);
            }
        } catch (Exception e) {
            throw new ExceptionConverter(e);
        }
    }

    /**
     * Returns the content bytes held by <code>object</code>. Indirect references
     * are resolved first, a stream yields its stream bytes, and an array yields the
     * bytes of all of its elements in order, each followed by a newline.
     *
     * @param object indirect reference, stream, or array of these
     * @return the content bytes
     * @throws IOException           if the stream bytes cannot be read
     * @throws IllegalStateException if <code>object</code> is of any other type
     */
    private byte[] getContentBytesFromPdfObject(PdfObject object) throws IOException {
        switch (object.type()) {
            case PdfObject.INDIRECT:
                return getContentBytesFromPdfObject(PdfReader.getPdfObject(object));
            case PdfObject.STREAM:
                return PdfReader.getStreamBytes((PRStream) PdfReader.getPdfObject(object));
            case PdfObject.ARRAY:
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                ListIterator<PdfObject> iter = ((PdfArray) object).listIterator();
                while (iter.hasNext()) {
                    PdfObject element = iter.next();
                    baos.write(getContentBytesFromPdfObject(element));
                    // Keep the last token of one stream from running into the
                    // first token of the next one.
                    baos.write('\n');
                }
                return baos.toByteArray();
            default:
                throw new IllegalStateException("Unsupported type: " + object.getClass().getCanonicalName());
        }
    }
}

/**
* Loads all the supported graphics and text state operators in a map.
Expand Down Expand Up @@ -827,6 +905,8 @@ protected void installDefaultOperators() {
registerContentOperator(new BeginMarked());
registerContentOperator(new BeginMarkedDict());
registerContentOperator(new EndMarked());

registerContentOperator(new Do());
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package com.lowagie.text.pdf.parser;


import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.junit.Assert;
import org.junit.Test;

import com.lowagie.text.pdf.PdfReader;


/**
 * Tests for {@link PdfTextExtractor#getTextFromPage(int)} using PDF files
 * located under <code>src/test/resources</code>.
 */
public class PdfTextExtractorTest {

    /**
     * Requesting page 5 of <code>HelloWorldMeta.pdf</code> must fail with an
     * {@link IOException} (presumably the document has fewer pages).
     */
    @Test(expected = IOException.class)
    public void testPageExceeded() throws Exception {
        getString("HelloWorldMeta.pdf", 5);
    }

    /**
     * Requesting page 0 must fail with an {@link IOException}.
     */
    @Test(expected = IOException.class)
    public void testInvalidPageNumber() throws Exception {
        getString("HelloWorldMeta.pdf", 0);
    }

    /**
     * Page 5 of <code>merge-acroforms.pdf</code> must yield both the chapter
     * heading and the watermark text.
     */
    @Test
    public void testConcatenateWatermark() throws Exception {
        String result = getString("merge-acroforms.pdf", 5);
        Assert.assertNotNull(result);
        // The extracted text may contain tag-like markup; strip everything of the
        // form <...> before checking the plain text.
        result = result.replaceAll("\\<.*?>","");
        // NOTE(review): the extractor may emit multiple spaces between words;
        // the expected strings below rely on single spaces.
        Assert.assertTrue(result.contains("2. This is chapter 2"));
        Assert.assertTrue(result.contains("watermark-concatenate"));
    }

    /**
     * Extracts the text of one page from a file in <code>src/test/resources</code>.
     *
     * @param fileName   name of the PDF file inside the test resources directory
     * @param pageNumber page number handed to the extractor
     * @return the extracted text
     */
    private String getString(String fileName, int pageNumber) throws Exception {
        return getString(new File("src/test/resources", fileName), pageNumber);
    }

    /**
     * Extracts the text of one page from the given PDF file.
     *
     * @param file       PDF file to read
     * @param pageNumber page number handed to the extractor
     * @return the extracted text
     */
    private String getString(File file, int pageNumber) throws Exception {
        byte[] pdfBytes = readDocument(file);
        final PdfReader pdfReader = new PdfReader(pdfBytes);
        try {
            return new PdfTextExtractor(pdfReader).getTextFromPage(pageNumber);
        } finally {
            // Release the reader even when extraction throws.
            pdfReader.close();
        }
    }

    /**
     * Reads the whole file into memory.
     *
     * @param file file to read
     * @return the file's bytes
     * @throws IOException if the file cannot be opened or read
     */
    protected static byte[] readDocument(final File file) throws IOException {
        try (ByteArrayOutputStream fileBytes = new ByteArrayOutputStream();
                InputStream inputStream = new FileInputStream(file)) {
            final byte[] buffer = new byte[8192];
            int bytesRead;
            while ((bytesRead = inputStream.read(buffer)) != -1) {
                fileBytes.write(buffer, 0, bytesRead);
            }
            return fileBytes.toByteArray();
        }
    }
}
Binary file added openpdf/src/test/resources/merge-acroforms.pdf
Binary file not shown.

0 comments on commit d39a65f

Please sign in to comment.