Manipulate paths, color etc. in iText

2020-05-01 01:33发布

问题:

I need to analyze path data of PDF files and manipulate content with iText 7. Manipulations include deletion/replacement and coloring.

I can analyze the graphics alright with something like the following code:

/**
 * Small driver that feeds every page of a PDF file through a
 * {@link PathEventListener}.
 */
public class ContentParsing {
    public static void main(String[] args) throws IOException {
        new ContentParsing().inspectPdf("testdata/test.pdf");
    }

    /**
     * Opens the PDF at the given path for reading and lets a fresh
     * {@link PathEventListener} process the content of each page in turn.
     *
     * @param path location of the PDF file to inspect
     * @throws IOException if the file cannot be read
     */
    public void inspectPdf(String path) throws IOException {
        File file = new File(path);
        // try-with-resources so that reader and document are released even if
        // processing a page fails part-way through.
        try (PdfReader reader = new PdfReader(file.getAbsolutePath());
             PdfDocument pdf = new PdfDocument(reader)) {
            PdfDocumentContentParser parser = new PdfDocumentContentParser(pdf);
            // iText page numbers are 1-based
            for (int i = 1; i <= pdf.getNumberOfPages(); i++) {
                parser.processContent(i, new PathEventListener());
            }
        }
    }
}


public class PathEventListener implements IEventListener {
    public void eventOccurred(IEventData eventData, EventType eventType) {
        PathRenderInfo pathRenderInfo = (PathRenderInfo) eventData;
        for ( Subpath subpath : pathRenderInfo.getPath().getSubpaths() ) {
            for ( IShape segment : subpath.getSegments() ) {
                // Here goes some path analysis code
                System.out.println(segment.getBasePoints());
            }
        }
    }

    public Set<EventType> getSupportedEvents() {
        Set<EventType> supportedEvents = new HashSet<EventType>();
        supportedEvents.add(EventType.RENDER_PATH);
        return supportedEvents;
    }
}

Now, what's the way to go with manipulating things and writing them back to the PDF? Do I have to construct an entirely new PDF document and copy everything over (in manipulated form), or can I somehow manipulate the read PDF data directly?

回答1:

Now, what's the way to go with manipulating things and writing them back to the PDF? Do I have to construct an entirely new PDF document and copy everything over (in manipulated form), or can I somehow manipulate the read PDF data directly?

In essence you are looking for a class which does not merely parse a PDF content stream and signal the instructions in it, as the PdfCanvasProcessor does (the PdfDocumentContentParser you use is merely a very thin wrapper around PdfCanvasProcessor), but which also creates the content stream anew from the instructions you forward back to it.

A generic content stream editor class

For iText 5.5.x a proof-of-concept for such a content stream editor class can be found in this answer (the Java version is a bit further down in the answer text).

This is a port of that proof-of-concept to iText 7:

public class PdfCanvasEditor extends PdfCanvasProcessor
{
    /**
     * This method edits the immediate contents of a page, i.e. its content stream.
     * It explicitly does not descent into form xobjects, patterns, or annotations.
     */
    public void editPage(PdfDocument pdfDocument, int pageNumber) throws IOException
    {
        if ((pdfDocument.getReader() == null) || (pdfDocument.getWriter() == null))
        {
            throw new PdfException("PdfDocument must be opened in stamping mode.");
        }

        PdfPage page = pdfDocument.getPage(pageNumber);
        PdfResources pdfResources = page.getResources();
        PdfCanvas pdfCanvas = new PdfCanvas(new PdfStream(), pdfResources, pdfDocument);
        editContent(page.getContentBytes(), pdfResources, pdfCanvas);
        page.put(PdfName.Contents, pdfCanvas.getContentStream());
    }

    /**
     * This method processes the content bytes and outputs to the given canvas.
     * It explicitly does not descent into form xobjects, patterns, or annotations.
     */
    public void editContent(byte[] contentBytes, PdfResources resources, PdfCanvas canvas)
    {
        this.canvas = canvas;
        processContent(contentBytes, resources);
        this.canvas = null;
    }

    /**
     * <p>
     * This method writes content stream operations to the target canvas. The default
     * implementation writes them as they come, so it essentially generates identical
     * copies of the original instructions the {@link ContentOperatorWrapper} instances
     * forward to it.
     * </p>
     * <p>
     * Override this method to achieve some fancy editing effect.
     * </p> 
     */
    protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
    {
        PdfOutputStream pdfOutputStream = canvas.getContentStream().getOutputStream();
        int index = 0;

        for (PdfObject object : operands)
        {
            pdfOutputStream.write(object);
            if (operands.size() > ++index)
                pdfOutputStream.writeSpace();
            else
                pdfOutputStream.writeNewLine();
        }
    }

    //
    // constructor giving the parent a dummy listener to talk to 
    //
    public PdfCanvasEditor()
    {
        super(new DummyEventListener());
    }

    //
    // Overrides of PdfContentStreamProcessor methods
    //
    @Override
    public IContentOperator registerContentOperator(String operatorString, IContentOperator operator)
    {
        ContentOperatorWrapper wrapper = new ContentOperatorWrapper();
        wrapper.setOriginalOperator(operator);
        IContentOperator formerOperator = super.registerContentOperator(operatorString, wrapper);
        return formerOperator instanceof ContentOperatorWrapper ? ((ContentOperatorWrapper)formerOperator).getOriginalOperator() : formerOperator;
    }

    //
    // members holding the output canvas and the resources
    //
    protected PdfCanvas canvas = null;

    //
    // A content operator class to wrap all content operators to forward the invocation to the editor
    //
    class ContentOperatorWrapper implements IContentOperator
    {
        public IContentOperator getOriginalOperator()
        {
            return originalOperator;
        }

        public void setOriginalOperator(IContentOperator originalOperator)
        {
            this.originalOperator = originalOperator;
        }

        @Override
        public void invoke(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
        {
            if (originalOperator != null && !"Do".equals(operator.toString()))
            {
                originalOperator.invoke(processor, operator, operands);
            }
            write(processor, operator, operands);
        }

        private IContentOperator originalOperator = null;
    }

    //
    // A dummy event listener to give to the underlying canvas processor to feed events to
    //
    static class DummyEventListener implements IEventListener
    {
        @Override
        public void eventOccurred(IEventData data, EventType type)
        { }

        @Override
        public Set<EventType> getSupportedEvents()
        {
            return null;
        }
    }
}

(PdfCanvasEditor.java)

The explanations from the iText 5 answer still apply; the parsing framework has not changed much from iText 5.5.x to iText 7.0.x.

Usage examples

Unfortunately you wrote in very vague terms about how exactly you want to change the contents. Thus I simply ported some iText 5 samples which made use of the original iText 5 content stream editor class:

Watermark removal

These are ports of the use cases in this answer.

testRemoveBoldMTTextDocument

This example drops all text written in a font the name of which ends with "BoldMT":

try (   InputStream source = getClass().getResourceAsStream("document.pdf");
        PdfReader reader = new PdfReader(source);
        OutputStream target = new FileOutputStream(new File(RESULT_FOLDER, "document-noBoldMTText.pdf"));
        PdfWriter writer = new PdfWriter(target);
        PdfDocument document = new PdfDocument(reader, writer) )
{
    // Editor that forwards every instruction except text-showing ones issued
    // while the current font's name ends with "BoldMT".
    PdfCanvasEditor boldTextRemover = new PdfCanvasEditor()
    {
        final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");

        @Override
        protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
        {
            if (!showsBoldMTText(operator))
            {
                super.write(processor, operator, operands);
            }
        }

        // true for a text-showing operator drawn in a font named "...BoldMT";
        // the font is only looked at for text-showing operators
        private boolean showsBoldMTText(PdfLiteral operator)
        {
            if (!TEXT_SHOWING_OPERATORS.contains(operator.toString()))
            {
                return false;
            }
            String fontName = getGraphicsState().getFont().getFontProgram().getFontNames().getFontName();
            return fontName.endsWith("BoldMT");
        }
    };
    // page numbers are 1-based
    for (int pageNumber = 1; pageNumber <= document.getNumberOfPages(); pageNumber++)
    {
        boldTextRemover.editPage(document, pageNumber);
    }
}

(EditPageContent.java test method testRemoveBoldMTTextDocument)

testRemoveBigTextDocument

This example drops all text written with a large font size:

try (   InputStream source = getClass().getResourceAsStream("document.pdf");
        PdfReader reader = new PdfReader(source);
        OutputStream target = new FileOutputStream(new File(RESULT_FOLDER, "document-noBigText.pdf"));
        PdfWriter writer = new PdfWriter(target);
        PdfDocument document = new PdfDocument(reader, writer) )
{
    // Editor that forwards every instruction except text-showing ones issued
    // while the current font size is above 100.
    PdfCanvasEditor bigTextRemover = new PdfCanvasEditor()
    {
        final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");

        @Override
        protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
        {
            if (!showsBigText(operator))
            {
                super.write(processor, operator, operands);
            }
        }

        // true for a text-showing operator issued at a font size above 100;
        // the font size is only looked at for text-showing operators
        private boolean showsBigText(PdfLiteral operator)
        {
            if (!TEXT_SHOWING_OPERATORS.contains(operator.toString()))
            {
                return false;
            }
            return getGraphicsState().getFontSize() > 100;
        }
    };
    // page numbers are 1-based
    for (int pageNumber = 1; pageNumber <= document.getNumberOfPages(); pageNumber++)
    {
        bigTextRemover.editPage(document, pageNumber);
    }
}

(EditPageContent.java test method testRemoveBigTextDocument)

Text color change

This is a port of the use case in this answer.

testChangeBlackTextToGreenDocument

This example changes the color of black text to green.

try (   InputStream source = getClass().getResourceAsStream("document.pdf");
        PdfReader reader = new PdfReader(source);
        OutputStream target = new FileOutputStream(new File(RESULT_FOLDER, "document-blackTextToGreen.pdf"));
        PdfWriter writer = new PdfWriter(target);
        PdfDocument document = new PdfDocument(reader, writer) )
{
    // Editor that inserts a green fill colour before text shown in black and
    // puts black back before the next instruction that does not show text.
    PdfCanvasEditor blackToGreen = new PdfCanvasEditor()
    {
        final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");

        // the black fill colour currently overridden with green, or null if none is
        Color currentlyReplacedBlack = null;

        @Override
        protected void write(PdfCanvasProcessor processor, PdfLiteral operator, List<PdfObject> operands)
        {
            boolean showsText = TEXT_SHOWING_OPERATORS.contains(operator.toString());

            if (showsText && currentlyReplacedBlack == null)
            {
                replaceBlackFillByGreen(processor);
            }
            else if (!showsText && currentlyReplacedBlack != null)
            {
                restoreBlackFill(processor);
            }

            super.write(processor, operator, operands);
        }

        // if the current fill colour equals Color.BLACK, remember it and emit "0 1 0 rg"
        private void replaceBlackFillByGreen(PdfCanvasProcessor processor)
        {
            Color fill = getGraphicsState().getFillColor();
            if (!Color.BLACK.equals(fill))
            {
                return;
            }
            currentlyReplacedBlack = fill;
            super.write(processor, new PdfLiteral("rg"), Arrays.asList(new PdfNumber(0), new PdfNumber(1), new PdfNumber(0), new PdfLiteral("rg")));
        }

        // emit black again in the colour space of the remembered colour, then forget it
        private void restoreBlackFill(PdfCanvasProcessor processor)
        {
            if (currentlyReplacedBlack instanceof DeviceCmyk)
            {
                super.write(processor, new PdfLiteral("k"), Arrays.asList(new PdfNumber(0), new PdfNumber(0), new PdfNumber(0), new PdfNumber(1), new PdfLiteral("k")));
            }
            else if (currentlyReplacedBlack instanceof DeviceGray)
            {
                super.write(processor, new PdfLiteral("g"), Arrays.asList(new PdfNumber(0), new PdfLiteral("g")));
            }
            else
            {
                super.write(processor, new PdfLiteral("rg"), Arrays.asList(new PdfNumber(0), new PdfNumber(0), new PdfNumber(0), new PdfLiteral("rg")));
            }
            currentlyReplacedBlack = null;
        }
    };
    // page numbers are 1-based
    for (int pageNumber = 1; pageNumber <= document.getNumberOfPages(); pageNumber++)
    {
        blackToGreen.editPage(document, pageNumber);
    }
}

(EditPageContent.java test method testChangeBlackTextToGreenDocument)



标签: itext itext7