I need to extract only the barcode from a PDF (by specifying a rectangle), without converting the whole PDF into an image.
The output image format can be JPG or PNG.
I need to extract only the barcode from a PDF (by specifying a rectangle), without converting the whole PDF into an image.
The output image format can be JPG or PNG.
With PDFBox, no coding is needed; run its bundled command-line tool:
"$JAVA_HOME/bin/java" -jar pdfbox-app-1.8.2.jar PDFToImage foo.pdf
For batch processing, use the following program:
import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.List;
import java.util.Observer;
import org.apache.pdfbox.PDFToImage;
/**
 * Batch front end for PDFBox's {@code PDFToImage} tool: hands each PDF file
 * to {@code PDFToImage.main} with an output prefix located in a target
 * directory.
 */
public class Main {

    static {
        // Select commons-logging's no-op Log implementation so that library
        // logging does not clutter the console output of this tool.
        System.setProperty(
            "org.apache.commons.logging.Log",
            "org.apache.commons.logging.impl.NoOpLog" );
    }

    /**
     * Returns {@code name} without its last extension.
     *
     * Only the file name is inspected, never the directory part, so a dot in
     * a directory name cannot truncate the path. Names without an extension
     * (or starting with their only dot) are returned unchanged.
     */
    private static String stripExtension( String name ) {
        final int dot = name.lastIndexOf( '.' );
        return ( dot > 0 ) ? name.substring( 0, dot ) : name;
    }

    /**
     * Runs {@code PDFToImage} on every file of {@code files}.
     *
     * The output prefix passed to {@code PDFToImage} is {@code jpegDir}
     * joined with the source file name stripped of its extension. Progress
     * is printed on standard output after each successfully handled file.
     * A failure on one file is reported on standard error and does not stop
     * the batch.
     *
     * @param files    the files to process
     * @param jpegDir  the output directory, created if missing
     * @param observer notified with {@code update( null, file )} after each
     *                 successfully processed file; may be {@code null}
     * @return the number of files processed without error
     */
    public static int extract( List< File > files, File jpegDir, Observer observer ) {
        jpegDir.mkdirs();
        int done = 0;
        for( final File file : files ) {
            try {
                final File   target = new File( jpegDir, stripExtension( file.getName()));
                final String prefix = target.getPath();
                PDFToImage.main( new String[]{ "-outputPrefix", prefix, file.getPath() });
                final double percent = (100.0 * ++done ) / files.size();
                System.out.printf( "%6.2f %%: %s\n", percent, file.getName());
                if( observer != null ) {
                    observer.update( null, file );
                }
            }
            catch( final Throwable t ) {
                // Deliberately broad: one unreadable file must not abort the batch.
                System.err.println( file.getPath());
                t.printStackTrace();
            }
        }
        return done;
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the directory scanned for {@code *.pdf}
     *             files (case-insensitive), {@code args[1]} the output directory
     */
    public static void main( String[] args ) {
        if( args.length != 2 ) {
            System.err.println(
                "usage: java -jar pdf2img.jar <PDF directory> <JPEG directory>" );
            System.exit(1);
        }
        final File   pdfDir  = new File( args[0] );
        final File   jpegDir = new File( args[1] );
        final File[] files   = pdfDir.listFiles( new FilenameFilter() {
            @Override public boolean accept( File dir, String name ) {
                return name.toLowerCase().endsWith( ".pdf" );
            }});
        if( files != null ) {
            final int done = extract( Arrays.asList( files ), jpegDir, null );
            System.out.printf(
                "\n%d file%s processed.", done, ( done > 1 ) ? "s" : "" );
        }
        else {
            // listFiles() returns null when the path is not a readable directory.
            System.err.println( "Cannot list directory: " + pdfDir.getPath());
        }
    }
}
This utility can be paired with a JavaFX GUI (localized in French):
/**
 * JavaFX front end for {@code Main.extract}.
 *
 * The window shows a read-only text field holding the output directory and a
 * table of files. Dropping a file or directory on the text field selects the
 * output directory; dropping files on the table adds them to the table and
 * processes them on a background thread, ticking the last column of each row
 * as soon as the corresponding file has been handled.
 */
public final class GUI extends Application {

    /**
     * Builds the scene, wires the drag-and-drop handlers and shows the stage.
     *
     * @param primaryStage the stage supplied by the JavaFX runtime
     */
    @Override
    public void start( Stage primaryStage ) throws Exception {
        final BorderPane        pane      = new BorderPane();
        final HBox              topPane   = new HBox();
        final Label             lbl       = new Label( "Répertoire des images : " );
        final TextField         jpegDir   = new TextField();
        final Button            browseBtn = new Button( "Parcourir..." );
        final TableView< File > filesVw   = new TableView<>();
        lbl.setAlignment( Pos.CENTER_LEFT );
        lbl      .setStyle( "-fx-padding:8px; -fx-margin:8px;" );
        jpegDir  .setStyle( "-fx-padding:8px; -fx-margin:8px;" );
        browseBtn.setStyle( "-fx-padding:8px; -fx-margin:8px;" );
        topPane.getChildren().addAll( lbl, jpegDir, browseBtn );
        pane.setTop( topPane );
        pane.setCenter( filesVw );
        jpegDir.setPrefColumnCount( 40 );
        // The directory is only ever set by drag and drop, never typed.
        jpegDir.setEditable( false );

        final ObservableList< TableColumn< File, ? >> columns = filesVw.getColumns();

        // Column 1: file name.
        final TableColumn< File, String > name = new TableColumn<>( "Nom" );
        name.setCellValueFactory(
            new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){
                @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){
                    return new SimpleStringProperty( p.getValue().getName()); }});
        name.setSortable( false );
        name.setPrefWidth( 400 );
        columns.add( name );

        // Column 2: file size in bytes, with grouping separators.
        final TableColumn< File, String > size = new TableColumn<>( "Taille" );
        size.setCellValueFactory(
            new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){
                @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){
                    return new SimpleStringProperty( String.format( "%,12d", p.getValue().length())); }});
        size.setSortable( true );
        size.setPrefWidth( 80 );
        columns.add( size );

        // Column 3: last modification date.
        final TableColumn< File, String > date = new TableColumn<>( "Date" );
        // "yyyy" is the calendar year; "YYYY" would be the week-based year,
        // which displays the wrong year for dates close to January 1st.
        final SimpleDateFormat sdf = new SimpleDateFormat( "dd/MM/yyyy HH:mm" );
        date.setCellValueFactory(
            new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){
                @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){
                    return new SimpleStringProperty(
                        sdf.format( new Date( p.getValue().lastModified()))); }});
        date.setSortable( true );
        date.setPrefWidth( 120 );
        columns.add( date );

        // Column 4: "processed" check box, backed by one property per file.
        final Map< File, SimpleBooleanProperty > dones = new HashMap<>();
        final TableColumn< File, Boolean > done = new TableColumn<>( "Traité" );
        done.setCellValueFactory(
            new Callback< CellDataFeatures< File, Boolean >, ObservableValue< Boolean >>(){
                @Override public ObservableValue< Boolean > call( CellDataFeatures< File, Boolean > p ){
                    return dones.get( p.getValue()); }});
        done.setCellFactory(
            new Callback<TableColumn<File,Boolean>,TableCell<File,Boolean>>(){
                @Override public TableCell<File,Boolean> call( TableColumn<File,Boolean> p ){
                    return new CheckBoxTableCell<>(); }});
        done.setSortable( true );
        done.setPrefWidth( 40 );
        columns.add( done );

        // Output directory field: accept anything dragged from outside the field.
        jpegDir.setOnDragOver(new EventHandler <DragEvent>() {
            @Override public void handle(DragEvent event) {
                if (event.getGestureSource() != jpegDir ) {
                    event.acceptTransferModes(TransferMode.COPY_OR_MOVE);
                }
                event.consume();
            }});
        // On drop, the path of the first dropped file becomes the output directory.
        jpegDir.setOnDragDropped(new EventHandler <DragEvent>() {
            @Override public void handle(DragEvent event) {
                final Dragboard db = event.getDragboard();
                boolean success = false;
                if( db.hasFiles()) {
                    jpegDir.setText( db.getFiles().get( 0 ).getPath());
                    success = true;
                }
                event.setDropCompleted( success );
                event.consume();
            }});

        // File table: refuse drops until an output directory has been chosen.
        filesVw.setOnDragOver(new EventHandler <DragEvent>() {
            @Override public void handle(DragEvent event) {
                if( event.getGestureSource() != filesVw && ! jpegDir.getText().isEmpty()) {
                    event.acceptTransferModes(TransferMode.COPY_OR_MOVE);
                }
                event.consume();
            }});
        // On drop, list the files and process them off the JavaFX thread.
        filesVw.setOnDragDropped(new EventHandler <DragEvent>() {
            @Override public void handle(DragEvent event) {
                final Dragboard db = event.getDragboard();
                boolean success = false;
                if( db.hasFiles()) {
                    final List< File > files  = db.getFiles();
                    final File         target = new File( jpegDir.getText());
                    for( final File f : files ) {
                        dones.put( f, new SimpleBooleanProperty( false ));
                    }
                    filesVw.getItems().addAll( files );
                    // The table stays disabled while the batch is running.
                    filesVw.setDisable( true );
                    new Thread(){@Override public void run() {
                        Main.extract(
                            files, target,
                            new Observer(){ @Override public void update( Observable o, final Object file ) {
                                // Properties bound to the table are only touched
                                // from the JavaFX application thread.
                                Platform.runLater( new Runnable() { @Override public void run() {
                                    dones.get( file ).setValue( Boolean.TRUE );
                                }});
                            }});
                        Platform.runLater( new Runnable() { @Override public void run() {
                            filesVw.setDisable( false );
                        }});
                    }}.start();
                    success = true;
                }
                event.setDropCompleted( success );
                event.consume();
            }});

        primaryStage.setScene( new Scene( pane ));
        primaryStage.setX( 0 );
        primaryStage.setY( 0 );
        primaryStage.show();
    }

    /**
     * Launches the JavaFX application.
     *
     * @param args command-line arguments, forwarded to the JavaFX runtime
     */
    public static void main( String[] args ) {
        launch( args );
    }
}
You can use PDFBox to write out the images referenced by each page's resources:
// Visit every page of the document and write each image listed in the
// page's resources to its own file.
List pages = document.getDocumentCatalog().getAllPages();
for( Object pageObject : pages )
{
    PDPage page = (PDPage)pageObject;
    Map images = page.getResources().getImages();
    if( images == null )
    {
        // Nothing to write for this page.
        continue;
    }
    for( Object keyObject : images.keySet() )
    {
        String key = (String)keyObject;
        PDXObjectImage image = (PDXObjectImage)images.get( key );
        // The file name is derived from the resource key and the image's suffix.
        String name = getUniqueFileName( key, image.getSuffix() );
        System.out.println( "Writing image:" + name );
        image.write2file( name );
    }
}
Reference source code
Try JPedal; it can extract almost any type of object (images, text, ...).
JPedal - Java developer library
The PdfDecoder API from JPedal will help you extract the words.
// Decode the page
decodePdf.decodePage(page);
// Create the grouping object to apply grouping to the data
PdfGroupingAlgorithms currentGrouping = decodePdf.getGroupingObject();
// Bounding box for the whole page: (x1, y1) is the top-left corner and
// (x2, y2) the bottom-right corner of the media box.
PdfPageData currentPageData = decodePdf.getPdfPageData();
int x1 = currentPageData.getMediaBoxX(page);
int x2 = currentPageData.getMediaBoxWidth(page)+x1;
// The bottom edge is the media box's Y origin (the original read the X origin here).
int y2 = currentPageData.getMediaBoxY(page);
// Top edge = origin + height, mirroring how x2 is derived from x1.
// NOTE(review): assumes getMediaBoxHeight returns a height, not a coordinate - confirm.
int y1 = currentPageData.getMediaBoxHeight(page)+y2;
// Extract words; the last argument lists the punctuation characters passed to the extractor
List words = currentGrouping.extractTextAsWordlist(x1, y1, x2, y2, page, true, "&:=()!;.,\\/\"\"\'\'");
Now iterate through the list to get the words in the PDF. Hope it works. Thanks!