我正在使用 PDFBox 提取 PDF 文档中单词/字符串的坐标,目前已经能够成功确定单个字符的位置。以下是我目前的代码,取自 PDFBox 文档:
package printtextlocations;
import java.io.*;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import java.io.IOException;
import java.util.List;
/**
 * Prints the position and size of every glyph of a PDF document to standard output.
 *
 * <p>The document path is hard-coded in {@link #main(String[])}. Each glyph reaches
 * {@link #processTextPosition(TextPosition)}, which writes one line describing it.
 */
public class PrintTextLocations extends PDFTextStripper {

    /**
     * Creates the stripper with position sorting switched on.
     *
     * @throws IOException if the parent constructor fails
     */
    public PrintTextLocations() throws IOException {
        super.setSortByPosition(true);
    }

    /**
     * Loads the PDF, decrypts it with an empty password when it is encrypted, and
     * dumps the glyphs of every page.
     *
     * @param args unused
     * @throws Exception if loading, decrypting or processing the document fails
     */
    public static void main(String[] args) throws Exception {
        PDDocument document = null;
        try {
            File input = new File("C:\\path\\to\\PDF.pdf");
            document = PDDocument.load(input);
            unlockIfEncrypted(document);
            dumpAllPages(new PrintTextLocations(), document);
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /** Tries the empty password on an encrypted document; exits with status 1 when it is rejected. */
    private static void unlockIfEncrypted(PDDocument document) throws Exception {
        if (!document.isEncrypted()) {
            return;
        }
        try {
            document.decrypt("");
        } catch (InvalidPasswordException e) {
            System.err.println("Error: Document is encrypted with a password.");
            System.exit(1);
        }
    }

    /** Feeds the content stream of each page that has one to the given stripper. */
    private static void dumpAllPages(PrintTextLocations printer, PDDocument document) throws IOException {
        List allPages = document.getDocumentCatalog().getAllPages();
        int pageIndex = 0;
        for (Object element : allPages) {
            PDPage page = (PDPage) element;
            System.out.println("Processing page: " + pageIndex);
            PDStream contents = page.getContents();
            if (contents != null) {
                printer.processStream(page, page.findResources(), page.getContents().getStream());
            }
            pageIndex++;
        }
    }

    /**
     * Writes one line with the coordinates, font size, x scale, height, space width
     * and width of the glyph, followed by the glyph's character(s).
     *
     * @param text the glyph to report
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        StringBuilder line = new StringBuilder("String[");
        line.append(text.getXDirAdj()).append(",").append(text.getYDirAdj());
        line.append(" fs=").append(text.getFontSize());
        line.append(" xscale=").append(text.getXScale());
        line.append(" height=").append(text.getHeightDir());
        line.append(" space=").append(text.getWidthOfSpace());
        line.append(" width=").append(text.getWidthDirAdj());
        line.append("]").append(text.getCharacter());
        System.out.println(line.toString());
    }
}
这会输出一系列行,每一行包含一个字符(包括空格)的位置,类似这样:
String[202.5604,41.880127 fs=1.0 xscale=13.98 height=9.68814 space=3.8864403 width=9.324661]P
其中“P”是字符。我没能在 PDFBox 中找到用于查找单词的函数,而且我对 Java 还不够熟悉,即使输出中也包含了空格,我也无法准确地把这些字符重新拼接成单词再进行搜索。有人遇到过类似的情况吗?如果有,你是怎么处理的?我其实只需要单词第一个字符的坐标,所以这部分比较简单,但如何用一个字符串去匹配这种输出,我就无能为力了。
PDFBox 中没有可以自动提取单词的功能。我目前正在提取数据并将其收集成块,以下是我的处理过程:
我提取文档中的所有字符(称为字形),并将它们存储在一个列表中。
我遍历这个列表,对每个字形的坐标进行分析。如果两个字形在垂直方向上重叠(即当前字形的顶部位于前一个字形的顶部和底部之间,或者当前字形的底部位于前一个字形的顶部和底部之间),我就把它们归入同一行。
到这一步,我已经提取出文档中的各个行(请注意,如果文档是多栏的,这里所说的“行”是指所有在垂直方向上重叠的字形,也就是所有栏中具有相同垂直坐标的文本)。
然后,你可以把当前字形的左侧坐标与前一个字形的右侧坐标进行比较,以确定它们是否属于同一个单词(PDFTextStripper 类提供了 getSpacingTolerance() 方法,它给出一个基于反复试验得到的“正常”空格宽度值)。如果右侧坐标与左侧坐标之间的差值小于这个值,则两个字形属于同一个单词。
我在自己的工作中应用了这个方法,效果很好。
基于这里最初的想法,下面是一个适用于 PDFBox 2 的文本搜索版本。代码本身比较粗糙,但很简单,应该能让你很快上手。
import java.io.IOException;
import java.io.Writer;
import java.util.List;
import java.util.Set;
import lu.abac.pdfclient.data.PDFTextLocation;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
/**
 * Searches a PDF document for a set of texts and records where each one was found.
 *
 * <p>The extracted text itself is discarded: the stripper writes into a no-op
 * {@link Writer}. Only the {@link PDFTextLocation} entries passed to the constructor
 * are updated, through their {@code setFound}, {@code setPage}, {@code setX} and
 * {@code setY} setters.
 */
public class PrintTextLocator extends PDFTextStripper {

    /** The search requests; each entry is updated in place when its text is found. */
    private final Set<PDFTextLocation> locations;

    /**
     * Creates a locator for the given document.
     *
     * @param document  the document to search
     * @param locations the texts to look for; matches are written back into these objects
     * @throws IOException if the parent constructor fails
     */
    public PrintTextLocator(PDDocument document, Set<PDFTextLocation> locations) throws IOException {
        super.setSortByPosition(true);
        this.document = document;
        this.locations = locations;
        // The stripper requires an output writer; the text is not needed, so swallow it.
        this.output = new Writer() {
            @Override
            public void write(char[] cbuf, int off, int len) throws IOException {
            }

            @Override
            public void flush() throws IOException {
            }

            @Override
            public void close() throws IOException {
            }
        };
    }

    /**
     * Processes every page of the document and returns the (updated) search requests.
     *
     * @return the same set that was passed to the constructor
     * @throws IOException if page processing fails
     */
    public Set<PDFTextLocation> doSearch() throws IOException {
        processPages(document.getDocumentCatalog().getPages());
        return locations;
    }

    /**
     * Checks one chunk of extracted text against every search request, ignoring case.
     *
     * <p>When a request matches, it is marked as found and receives the current page
     * number and the coordinates of the glyph at the match offset. A later match
     * overwrites an earlier one.
     *
     * @param text          the extracted text chunk
     * @param textPositions the glyph positions belonging to the chunk
     * @throws IOException if writing the text to the output fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        super.writeString(text);
        if (textPositions.isEmpty()) {
            // No glyph to take coordinates from.
            return;
        }
        // Locale.ROOT keeps case folding stable regardless of the JVM default locale
        // (e.g. the Turkish dotless i would otherwise break matching).
        String searchText = text.toLowerCase(java.util.Locale.ROOT);
        int lastIndex = textPositions.size() - 1;
        for (PDFTextLocation textLoc : locations) {
            String wanted = textLoc.getText();
            if (wanted == null) {
                continue;
            }
            int start = searchText.indexOf(wanted.toLowerCase(java.util.Locale.ROOT));
            if (start == -1) {
                continue;
            }
            // The character offset within the text is used as glyph index. The two
            // sequences are not guaranteed to have the same length, so clamp the index
            // instead of risking an IndexOutOfBoundsException; in that case the
            // reported position is an approximation.
            TextPosition pos = textPositions.get(Math.min(start, lastIndex));
            textLoc.setFound(true);
            textLoc.setPage(getCurrentPageNo());
            textLoc.setX(pos.getXDirAdj());
            textLoc.setY(pos.getYDirAdj());
        }
    }
}
看看这个,我认为这正是你所需要的。
https://jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/
下面是代码:
import java.io.File;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
/**
 * Locates words in a PDF document and reports the coordinates of their first glyph.
 *
 * <p>Glyphs are assembled into words: a word ends at a delimiter character, at
 * whitespace, when the rounded Y coordinate changes, or at the end of a page. A
 * finished word is stored in {@link #wordList} as {@code (page)[x : y] WORD} when it
 * equals one of the comma-separated search words, or unconditionally when the search
 * argument is {@code SHOWMETHEMONEY}.
 *
 * <p>All state is kept in static fields, so only one document can be processed at a
 * time per JVM.
 */
public class PrintTextLocations extends PDFTextStripper {

    /** Buffer for the word currently being assembled, including its coordinate prefix. */
    public static StringBuilder tWord = new StringBuilder();
    /** The raw search argument as given on the command line. */
    public static String seek;
    /** The search argument split at commas. */
    public static String[] seekA;
    /** The matched words, each prefixed with page number and coordinates. */
    public static List<String> wordList = new ArrayList<String>();
    /** True while the next word character starts a new word. */
    public static boolean is1stChar = true;
    /** Whether the most recent glyph was on the same rounded Y coordinate as the one before. */
    public static boolean lineMatch;
    /** 1-based number of the page being processed. */
    public static int pageNo = 1;
    /** Rounded Y coordinate of the most recent line. */
    public static double lastYVal;

    /** Single characters that terminate a word. */
    private static final String WORD_DELIMITERS = ",.[](:;!?)/";

    /**
     * Creates the stripper with position sorting switched on.
     *
     * @throws IOException if the parent constructor fails
     */
    public PrintTextLocations()
            throws IOException {
        super.setSortByPosition(true);
    }

    /**
     * Processes a PDF file and prints the list of located words.
     *
     * @param args {@code args[0]} is the PDF path, {@code args[1]} the comma-separated
     *             search words (or {@code SHOWMETHEMONEY} to list every word)
     * @throws Exception if loading, decrypting or processing the document fails
     */
    public static void main(String[] args)
            throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: java PrintTextLocations <file.pdf> <WORD1,WORD2,...>");
            System.exit(1);
            return;
        }
        PDDocument document = null;
        seekA = args[1].split(",");
        seek = args[1];
        try {
            File input = new File(args[0]);
            document = PDDocument.load(input);
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    System.exit(1);
                }
            }
            PrintTextLocations printer = new PrintTextLocations();
            List allPages = document.getDocumentCatalog().getAllPages();
            for (int i = 0; i < allPages.size(); i++) {
                PDPage page = (PDPage) allPages.get(i);
                PDStream contents = page.getContents();
                if (contents != null) {
                    printer.processStream(page, page.findResources(), page.getContents().getStream());
                }
                // Flush the word still pending at the end of the page; otherwise the
                // last word is lost and words could run on across a page break.
                printer.endWord();
                pageNo += 1;
            }
        } finally {
            if (document != null) {
                System.out.println(wordList);
                document.close();
            }
        }
    }

    /**
     * Prints a debug line for the glyph and adds the glyph to the word being assembled,
     * or finishes the word when the glyph is a delimiter or whitespace.
     *
     * @param text the glyph to process
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        String tChar = text.getCharacter();
        System.out.println("String[" + text.getXDirAdj() + ","
                + text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale="
                + text.getXScale() + " height=" + text.getHeightDir() + " space="
                + text.getWidthOfSpace() + " width="
                + text.getWidthDirAdj() + "]" + text.getCharacter());
        if (tChar == null || tChar.isEmpty()) {
            // Nothing to classify; charAt(0) below would fail.
            return;
        }
        char c = tChar.charAt(0);
        lineMatch = matchCharLine(text);
        boolean isDelimiter = tChar.length() == 1 && WORD_DELIMITERS.indexOf(c) >= 0;
        if (isDelimiter || Character.isWhitespace(c)) {
            endWord();
        } else if (is1stChar) {
            setWordCoord(text, tChar);
        } else if (lineMatch) {
            appendChar(tChar);
        }
    }

    /**
     * Appends a glyph to the word being assembled.
     *
     * @param tChar the glyph's character(s)
     */
    protected void appendChar(String tChar) {
        tWord.append(tChar);
        is1stChar = false;
    }

    /**
     * Starts a new word: writes the {@code (page)[x : y] } prefix from the glyph's
     * rounded coordinates, followed by the glyph itself.
     *
     * @param text  the first glyph of the word
     * @param tChar the glyph's character(s)
     */
    protected void setWordCoord(TextPosition text, String tChar) {
        tWord.append("(").append(pageNo).append(")[")
                .append(roundVal(Float.valueOf(text.getXDirAdj())))
                .append(" : ")
                .append(roundVal(Float.valueOf(text.getYDirAdj())))
                .append("] ").append(tChar);
        is1stChar = false;
    }

    /**
     * Finishes the word being assembled. Non-ASCII characters are removed; the word is
     * added to {@link #wordList} when it is one of the search words or when the search
     * argument is {@code SHOWMETHEMONEY}. The buffer is reset in every case.
     */
    protected void endWord() {
        String newWord = tWord.toString().replaceAll("[^\\x00-\\x7F]", "");
        // The bare word is everything after the last space of the coordinate prefix.
        String sWord = newWord.substring(newWord.lastIndexOf(' ') + 1);
        if (!"".equals(sWord)) {
            boolean wanted = seekA != null && Arrays.asList(seekA).contains(sWord);
            if (wanted || "SHOWMETHEMONEY".equals(seek)) {
                wordList.add(newWord);
            }
        }
        tWord.delete(0, tWord.length());
        is1stChar = true;
    }

    /**
     * Tells whether the glyph lies on the same rounded Y coordinate as the previous
     * one. When it does not, the new coordinate is remembered and the pending word is
     * finished.
     *
     * @param text the glyph to check
     * @return true if the rounded Y coordinate equals that of the previous glyph
     */
    protected boolean matchCharLine(TextPosition text) {
        Double yVal = roundVal(Float.valueOf(text.getYDirAdj()));
        if (yVal.doubleValue() == lastYVal) {
            return true;
        }
        lastYVal = yVal.doubleValue();
        endWord();
        return false;
    }

    /**
     * Rounds a coordinate to one decimal place (ties to even).
     *
     * <p>Done arithmetically rather than by formatting and re-parsing a string, which
     * fails under default locales that do not use {@code '.'} as decimal separator.
     * A float times ten is exact in double precision, so the result equals that of a
     * half-even decimal rounding.
     *
     * @param yVal the value to round
     * @return the value rounded to one decimal place
     */
    protected Double roundVal(Float yVal) {
        return Double.valueOf(Math.rint(yVal.doubleValue() * 10.0) / 10.0);
    }
}
依赖关系:
PDFBox、FontBox、Apache Commons Logging。
您可以在命令行中输入以下命令来运行它:
javac PrintTextLocations.java
sudo java PrintTextLocations file.pdf WORD1,WORD2,....
输出类似于:
[(1)[190.3 : 286.8] WORD1, (1)[283.3 : 286.8] WORD2, ...]
我使用经 IKVM 转换的 PDFBox.NET 1.8.9,在 C# 和 .NET 中实现了这一点。
我最终发现,字符(字形)坐标在 .NET 程序集中是私有的,但可以通过 System.Reflection 访问。
我在这里发布了一个完整的示例,演示如何获取单词的坐标,并使用 SVG 和 HTML 把它们绘制回 PDF 页面的图像上: https://github.com/tsamop/PDF_Interpreter
对于下面的例子,你需要 PDFbox.NET: http://www.squarepdf.net/pdfbox-in-net ,并在项目中添加对它的引用。
我花了相当长一段时间来弄明白,所以我真的希望这样可以节省别人的时间!
如果你只需要知道到哪里去找字符及其坐标,一个非常精简的版本如下:
using System;
using System.Reflection;
using org.apache.pdfbox.pdmodel;
using org.apache.pdfbox.util;
// to test run pdfTest.RunTest(@"C:\temp\test_2.pdf");
class pdfTest
{
    /// <summary>
    /// Simple example for getting character (glyph) coordinates out of a PDF document.
    /// Each page is run through a PDFTextStripper; the glyphs it collected are then read
    /// from its non-public "charactersByArticle" field via reflection and printed to the
    /// debug output.
    /// A more complete example is here: https://github.com/tsamop/PDF_Interpreter
    /// </summary>
    /// <param name="sFilename">Path of the PDF file to read.</param>
    public static void RunTest(string sFilename)
    {
        PDDocument oDoc = PDDocument.load(sFilename);
        try
        {
            // This gets the "charactersByArticle" non-public field in PDFBox.
            // The lookup does not depend on the page, so do it once.
            FieldInfo charactersByArticleInfo = typeof(PDFTextStripper).GetField("charactersByArticle", BindingFlags.NonPublic | BindingFlags.Instance);
            if (charactersByArticleInfo == null)
            {
                throw new MissingFieldException(typeof(PDFTextStripper).FullName, "charactersByArticle");
            }

            // Probably a better way to get page count, but I cut this out of a bigger project.
            object[] oPages = oDoc.getDocumentCatalog().getAllPages().toArray();
            // Page numbers passed to the stripper are 1-based.
            for (int iPageNo = 1; iPageNo <= oPages.Length; iPageNo++)
            {
                // Feed the stripper a single page.
                PDFTextStripper tStripper = new PDFTextStripper();
                tStripper.setStartPage(iPageNo);
                tStripper.setEndPage(iPageNo);
                tStripper.getText(oDoc);

                object charactersByArticle = charactersByArticleInfo.GetValue(tStripper);
                object[] aoArticles = (object[])charactersByArticle.GetField("elementData");
                foreach (object oArticle in aoArticles)
                {
                    // The backing array may contain unused (null) slots.
                    if (oArticle == null)
                    {
                        continue;
                    }
                    // The characters within the article.
                    object[] aoCharacters = (object[])oArticle.GetField("elementData");
                    foreach (object oChar in aoCharacters)
                    {
                        /* Fields found using reflection:
                         * endX, endY, font, fontSize, fontSizePt, maxTextHeight, pageHeight, pageWidth, rot, str textPos, unicodCP, widthOfSpace, widths, wordSpacing, x, y
                         */
                        if (oChar == null)
                        {
                            continue;
                        }
                        // This is a really quick test.
                        // For a more complete solution that pulls the characters into words and displays
                        // the word positions on the page, try this: https://github.com/tsamop/PDF_Interpreter
                        string sText = Convert.ToString(oChar.GetField("str"));
                        if (string.IsNullOrEmpty(sText))
                        {
                            // No character to report; indexing [0] would fail.
                            continue;
                        }
                        char mcThisChar = sText[0];

                        // The Y's appear to be the bottom of the char?
                        double mfMaxTextHeight = Convert.ToDouble(oChar.GetField("maxTextHeight")); // I think this is the height of the character/word
                        double mfX = Convert.ToDouble(oChar.GetField("x"));
                        double mfY = Convert.ToDouble(oChar.GetField("y")) - mfMaxTextHeight;

                        // The values below are not printed; they show how to derive the rest of the glyph box.
                        // Calculate the other side of the glyph.
                        double mfWidth0 = ((Single[])oChar.GetField("widths"))[0];
                        double mfXend = mfX + mfWidth0; // Convert.ToDouble(oChar.GetField("endX"));
                        // Calculate the bottom of the glyph.
                        double mfYend = mfY + mfMaxTextHeight; // Convert.ToDouble(oChar.GetField("endY"));
                        double mfPageHeight = Convert.ToDouble(oChar.GetField("pageHeight"));
                        double mfPageWidth = Convert.ToDouble(oChar.GetField("pageWidth"));

                        System.Diagnostics.Debug.Print(@"add some stuff to test {0}, {1}, {2}", mcThisChar, mfX, mfY);
                    }
                }
            }
        }
        finally
        {
            // Release the file handle and buffers held by the document.
            oDoc.close();
        }
    }
}
using System.Reflection;
/// <summary>
/// To deal with the Java interface hiding necessary properties! ~mwr
/// Provides reflection-based read access to instance fields of arbitrary objects.
/// </summary>
public static class GetField_Extension
{
    /// <summary>
    /// Reads the value of an instance field by name, whether it is public or not.
    /// The object's type and then its base types are searched, because private fields
    /// declared on a base type are not returned when reflecting on a derived type.
    /// </summary>
    /// <param name="randomPDFboxObject">The object to read the field from.</param>
    /// <param name="sFieldName">The name of the field.</param>
    /// <returns>The current value of the field.</returns>
    /// <exception cref="System.ArgumentNullException">The object is null.</exception>
    /// <exception cref="System.MissingFieldException">No field of that name exists on the object's type or its base types.</exception>
    public static object GetField(this object randomPDFboxObject, string sFieldName)
    {
        if (randomPDFboxObject == null)
        {
            throw new System.ArgumentNullException("randomPDFboxObject");
        }

        const BindingFlags flags = BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance | BindingFlags.DeclaredOnly;
        for (System.Type currentType = randomPDFboxObject.GetType(); currentType != null; currentType = currentType.BaseType)
        {
            FieldInfo itemInfo = currentType.GetField(sFieldName, flags);
            if (itemInfo != null)
            {
                return itemInfo.GetValue(randomPDFboxObject);
            }
        }

        // Fail with the type and field name instead of a bare NullReferenceException.
        throw new System.MissingFieldException(randomPDFboxObject.GetType().FullName, sFieldName);
    }
}