Extract Image from PDF using Java

2019-01-11 19:10发布

问题:

I need to extract only the barcode from a PDF (by specifying a rectangle), without converting the whole PDF into an image.

The image format can be jpg/png.

回答1:

With PDFBox, without writing any code:

"$JAVA_HOME/bin/java" -jar pdfbox-app-1.8.2.jar PDFToImage foo.pdf

To do batch processing:

import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.List;
import java.util.Observer;

import org.apache.pdfbox.PDFToImage;

/**
 * Batch converter: hands every PDF of a directory to the PDFBox
 * {@code PDFToImage} command-line entry point and reports progress.
 */
public class Main {

   static {
      // Selects the commons-logging no-op implementation
      // (presumably to silence PDFBox's logging output -- confirm).
      System.setProperty(
         "org.apache.commons.logging.Log",
         "org.apache.commons.logging.impl.NoOpLog" );
   }

   /**
    * Converts each file of {@code files} with {@code PDFToImage}, writing the
    * results into {@code jpegDir} (created if missing).
    *
    * <p>A failure on one file is printed to {@code System.err} and does not
    * stop the batch; the file is simply not counted as done.
    *
    * @param files    the PDF files to convert
    * @param jpegDir  the directory receiving the generated images
    * @param observer optional callback, invoked as {@code update(null, file)}
    *                 after each successful conversion; may be {@code null}
    * @return the number of files converted without error
    */
   public static int extract( List< File > files, File jpegDir, Observer observer ) {
      jpegDir.mkdirs();
      int done = 0;
      for( final File file : files ) {
         try {
            // Build the prefix from the file NAME only: searching the whole
            // path for '.' fails on extension-less files (index -1) and cuts
            // the path at a dot belonging to a directory name.
            final File   target = new File( jpegDir, stripExtension( file.getName()));
            final String prefix = target.getPath();
            PDFToImage.main( new String[]{ "-outputPrefix", prefix, file.getPath() });
            final double percent  = (100.0 * ++done ) / files.size();
            System.out.printf( "%6.2f %%: %s\n", percent, file.getName());
            if( observer != null ) {
               observer.update( null, file );
            }
         }
         catch( final Throwable t ) {
            // Deliberate best-effort: report the failing file and carry on.
            System.err.println( file.getPath());
            t.printStackTrace();
         }
      }
      return done;
   }

   /**
    * Returns {@code name} without its last extension. A name with no dot, or
    * whose only dot is the leading character, is returned unchanged.
    */
   private static String stripExtension( String name ) {
      final int dot = name.lastIndexOf( '.' );
      return ( dot > 0 ) ? name.substring( 0, dot ) : name;
   }

   /**
    * Command-line entry point.
    *
    * @param args {@code <PDF directory> <JPEG directory>}
    */
   public static void main( String[] args ) {
      if( args.length != 2 ) {
         System.err.println(
            "usage: java -jar pdf2img.jar <PDF directory> <JPEG directory>" );
         System.exit(1);
      }
      final File   pdfDir  = new File( args[0] );
      final File   jpegDir = new File( args[1] );
      final File[] files   = pdfDir.listFiles( new FilenameFilter() {
         @Override public boolean accept( File dir, String name ) {
            return name.toLowerCase().endsWith( ".pdf" );
         }});
      if( files == null ) {
         // listFiles() yields null when the path is not a directory or cannot
         // be read; say so instead of exiting silently.
         System.err.println( "Cannot list PDF directory: " + pdfDir.getPath());
         System.exit(1);
      }
      final int done = extract( Arrays.asList( files ), jpegDir, null );
      System.out.printf(
         "\n%d file%s processed.", done, ( done > 1 ) ? "s" : "" );
   }
}

This utility can be paired with a GUI (localized in French):

/**
 * JavaFX front end for {@link Main#extract}: the user drops a target directory
 * on the text field, then drops files on the table to have them converted on a
 * background thread. User-visible labels are in French.
 */
public final class GUI extends Application {

   /**
    * Builds the window: a top bar holding the target-directory field and a
    * table listing the dropped files (name, size, date, processed flag), then
    * wires the drag-and-drop handlers.
    *
    * @param primaryStage the stage supplied by the JavaFX runtime
    */
   @Override
   public void start( Stage primaryStage ) throws Exception {
      final BorderPane        pane      = new BorderPane();
      final HBox              topPane   = new HBox();
      final Label             lbl       = new Label( "Répertoire des images : " );
      final TextField         jpegDir   = new TextField();
      final Button            browseBtn = new Button( "Parcourir..." );
      final TableView< File > filesVw   = new TableView<>();
      lbl.setAlignment( Pos.CENTER_LEFT );
      lbl      .setStyle( "-fx-padding:8px; -fx-margin:8px;" );
      jpegDir  .setStyle( "-fx-padding:8px; -fx-margin:8px;" );
      browseBtn.setStyle( "-fx-padding:8px; -fx-margin:8px;" );
      topPane.getChildren().addAll( lbl, jpegDir, browseBtn );
      pane.setTop( topPane );
      pane.setCenter( filesVw );
      jpegDir.setPrefColumnCount( 40 );
      // The directory is only ever set by dropping a file on the field.
      jpegDir.setEditable( false );
      final ObservableList< TableColumn< File, ? >> columns = filesVw.getColumns();

      // Column: file name.
      final TableColumn< File, String > name = new TableColumn<>( "Nom" );
      name.setCellValueFactory(
         new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){
            @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){
               return new SimpleStringProperty( p.getValue().getName()); }});
      name.setSortable( false );
      name.setPrefWidth( 400 );
      columns.add( name );

      // Column: file size in bytes, with grouping separators.
      final TableColumn< File, String > size = new TableColumn<>( "Taille" );
      size.setCellValueFactory(
         new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){
            @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){
               return new SimpleStringProperty( String.format( "%,12d", p.getValue().length())); }});
      size.setSortable( true );
      size.setPrefWidth( 80 );
      columns.add( size );

      // Column: last-modified date.
      final TableColumn< File, String > date = new TableColumn<>( "Date" );
      // "yyyy" is the calendar year; "YYYY" is the week-based year and shows
      // the wrong year for dates in the weeks around New Year.
      final SimpleDateFormat sdf = new SimpleDateFormat( "dd/MM/yyyy HH:mm" );
      date.setCellValueFactory(
         new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){
            @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){
               return new SimpleStringProperty(
                  sdf.format( new Date( p.getValue().lastModified()))); }});
      date.setSortable( true );
      date.setPrefWidth( 120 );
      columns.add( date );

      // Column: "processed" check box, backed by one boolean property per file.
      final Map< File, SimpleBooleanProperty > dones = new HashMap<>();
      final TableColumn< File, Boolean > done = new TableColumn<>( "Traité" );
      done.setCellValueFactory(
         new Callback< CellDataFeatures< File, Boolean >, ObservableValue< Boolean >>(){
            @Override public ObservableValue< Boolean > call( CellDataFeatures< File, Boolean > p ){
               return dones.get( p.getValue()); }});
      done.setCellFactory(
         new Callback<TableColumn<File,Boolean>,TableCell<File,Boolean>>(){
            @Override public TableCell<File,Boolean> call( TableColumn<File,Boolean> p ){
               return new CheckBoxTableCell<>(); }});
      done.setSortable( true );
      done.setPrefWidth( 40 );
      columns.add( done );

      // Target-directory field: accept any drag that did not start on the field itself.
      jpegDir.setOnDragOver(new EventHandler <DragEvent>() {
         @Override public void handle(DragEvent event) {
            if (event.getGestureSource() != jpegDir ) {
               event.acceptTransferModes(TransferMode.COPY_OR_MOVE);
            }
            event.consume();
         }});
      // On drop, the path of the first dropped file becomes the target directory.
      jpegDir.setOnDragDropped(new EventHandler <DragEvent>() {
         @Override public void handle(DragEvent event) {
            final Dragboard db = event.getDragboard();
            boolean success = false;
            if( db.hasFiles()) {
               jpegDir.setText( db.getFiles().get( 0 ).getPath());
               success = true;
            }
            event.setDropCompleted( success );
            event.consume();
         }});

      // File table: drops are refused until a target directory has been chosen.
      filesVw.setOnDragOver(new EventHandler <DragEvent>() {
         @Override public void handle(DragEvent event) {
            if( event.getGestureSource() != filesVw && ! jpegDir.getText().isEmpty()) {
               event.acceptTransferModes(TransferMode.COPY_OR_MOVE);
            }
            event.consume();
         }});
      // On drop: list the files, disable the table, and convert on a worker thread.
      filesVw.setOnDragDropped(new EventHandler <DragEvent>() {
         @Override public void handle(DragEvent event) {
            final Dragboard db = event.getDragboard();
            boolean success = false;
            if( db.hasFiles()) {
               final List< File > files  = db.getFiles();
               final File         target = new File( jpegDir.getText());
               // Register the flags before the rows are added so the cell
               // value factory finds a property for every file.
               for( final File f : files ) {
                  dones.put( f, new SimpleBooleanProperty( false ));
               }
               filesVw.getItems().addAll( files );
               filesVw.setDisable( true );
               new Thread(){@Override public void run() {
                  Main.extract(
                     files, target,
                     new Observer(){ @Override public void update( Observable o, final Object file ) {
                        // UI state is only touched via Platform.runLater.
                        Platform.runLater( new Runnable() { @Override public void run() {
                           dones.get( file ).setValue( Boolean.TRUE );
                        }});
                     }});
                  // Batch finished: re-enable the table.
                  Platform.runLater( new Runnable() { @Override public void run() {
                     filesVw.setDisable( false );
                  }});
               }}.start();
               success = true;
            }
            event.setDropCompleted( success );
            event.consume();
         }});
      primaryStage.setScene( new Scene( pane ));
      primaryStage.setX( 0 );
      primaryStage.setY( 0 );
      primaryStage.show();
   }

   /** Launches the JavaFX application. */
   public static void main( String[] args ) {
      launch();
   }
}


回答2:

You can use PDFBox:

// Visit every page of the document and write each image found in the page's
// resources to its own file.
List pages = document.getDocumentCatalog().getAllPages();
for( Object pageEntry : pages )
{
    PDPage page = (PDPage)pageEntry;
    PDResources resources = page.getResources();
    Map images = resources.getImages();
    if( images == null )
    {
        // No image map for this page: move on to the next one.
        continue;
    }
    for( Object keyEntry : images.keySet() )
    {
        String key = (String)keyEntry;
        PDXObjectImage image = (PDXObjectImage)images.get( key );
        // The output name is derived from the resource key and the image's suffix.
        String name = getUniqueFileName( key, image.getSuffix() );
        System.out.println( "Writing image:" + name );
        image.write2file( name );
    }
}

Reference source code



回答3:

Try JPedal; that will work. It can extract almost any type of object (images, text, ...).

jpedal-Java developer library



回答4:

The PdfDecoder API from JPedal will help you extract the words.

// Decode the page
decodePdf.decodePage(page);

// Create the grouping object to apply grouping to the data
PdfGroupingAlgorithms currentGrouping = decodePdf.getGroupingObject();

// Bounding box for the whole page, derived from the media box origin and size
PdfPageData currentPageData = decodePdf.getPdfPageData();
int x1 = currentPageData.getMediaBoxX(page);
int x2 = currentPageData.getMediaBoxWidth(page)+x1;
// The vertical bounds must use the Y origin (the original read the X origin
// here), and the far edge is origin + extent, mirroring how x2 is built from x1.
// NOTE(review): assumes y1 is the upper edge and y2 the lower edge -- confirm
// against the JPedal documentation.
int y2 = currentPageData.getMediaBoxY(page);
int y1 = currentPageData.getMediaBoxHeight(page)+y2;

// Extract words
// NOTE(review): the last argument is presumably the set of punctuation
// characters treated as word breaks -- confirm against the JPedal documentation.
List words = currentGrouping.extractTextAsWordlist(x1, y1, x2, y2, page, true, "&:=()!;.,\\/\"\"\'\'");

Now iterate through the list to get the words in the PDF. Hope it works. Thanks!