I created a program to read and extract text from PDF files... But it producing this exception during execution..
java.io.IOException: Error: Expected a long type, actual='930[299'
at org.apache.pdfbox.pdfparser.BaseParser.readLong(BaseParser.java:1669)
at org.apache.pdfbox.pdfparser.PDFObjectStreamParser.parse(PDFObjectStreamParser.java:100)
at org.apache.pdfbox.cos.COSDocument.dereferenceObjectStreams(COSDocument.java:632)
at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:244)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1205)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1172)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1097)
at PatentAdder.main(PatentAdder.java:60)
This is my code :
import java.awt.Rectangle;
import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFTextStripperByArea;
public class PatentAdder {
/**
* @param args
*/
public static String patno,patit,patdate,patfilled,appno;
private static int File;
public static void main(String[] args) {
// TODO Auto-generated method stub
try {
int cnt=0;
if( args.length == 1 )
{
// usage();
}
else
{
PDDocument document = null;
try
{
File dataDir = new File("F:/patents/test/tittest/USP2002w17/06/378/pdfs");
File[] files = dataDir.listFiles();
// String[] files = dataDir.list();
int count=0;
// System.out.println ("Satrt1");
for (File file : files) {
// System.out.println ("Satrt2");
File f = file;
if (!f.isDirectory()) {
document = PDDocument.load(f.getAbsolutePath());
if( document.isEncrypted() )
{
try
{
document.decrypt( "" );
}
catch( InvalidPasswordException e )
{
System.err.println( "Error: Document is encrypted with a password." );
System.exit( 1 );
}
} }
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition( true );
// Rectangle rectt = new Rectangle( 590, 108, 600, 100 ); // enlarge title
Rectangle rectt = new Rectangle( 288, 60, 222, 40 );
Rectangle rect = new Rectangle( 55, 108, 230, 600 ); // US-Patent title h40
// Rectangle rect = new Rectangle( 108, 210, 480, 499 ); //full enlarge
stripper.addRegion( "class1", rect );
stripper.addRegion("class2", rectt);
List allPages = document.getDocumentCatalog().getAllPages();
PDPage firstPage = (PDPage)allPages.get( 0 );
stripper.extractRegions( firstPage );
String title = "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)";
String in ="((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\[\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))";
String as ="((?s)\\(\\d\\d\\)\\s+Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Notice:))|((?s)\\(\\d\\d\\)\\s+Assignee:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Notice:))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+)(?=Notice:))";
String app_no ="(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)";
String filed ="((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))";
String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))";
String pat_no = "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)";
String pat_dt = "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)";
// System.out.println(rg);
String region = stripper.getTextForRegion( "class1" );
// System.out.println(region);
String regiont = stripper.getTextForRegion( "class2" );
Pattern p = Pattern.compile(in);
Matcher m = p.matcher(region);
Pattern p2 = Pattern.compile(as);
Matcher m2 = p2.matcher(region);
Pattern p3 = Pattern.compile(title);
Matcher m3 = p3.matcher(region);
Pattern p4 = Pattern.compile(pat_no);
Matcher m4 = p4.matcher(regiont);
Pattern p5 = Pattern.compile(app_no);
Matcher m5 = p5.matcher(region);
Pattern p6 = Pattern.compile(filed);
Matcher m6 = p6.matcher(region);
Pattern p7 = Pattern.compile(pat_dt);
Matcher m7 = p7.matcher(regiont);
while(m.find())
{
// System.out.println(m.group());
}
while(m2.find())
{
// System.out.println(m2.group());
}
while(m3.find())
{
// System.out.println(m3.group());
patit = m3.group().replace("(54)", " ");
patit = patit.trim();
}
while(m4.find())
{
// System.out.println(m4.group());
patno = m4.group().replace("Patent No.: ", " ");
patno = patno.replace("Patent No: ", " ");
patno = patno.replace("Patent", " ");
patno = patno.replace("No.:", " ");
patno = patno.replace("No:", " ");
patno = patno.replace("Number: ", " ");
patno = patno.replace("Number.: ", " ");
patno = patno.trim();
}
while(m5.find())
{
// System.out.println(m5.group());
appno = m5.group().replace("(21)", " ");
appno = appno.replace("Appl. No.: ", " ");
appno = appno.replace("Appl.", " ");
appno = appno.replace("No.", " ");
appno = appno.replace(":"," ");
appno = appno.trim();
}
while(m6.find())
{
// System.out.println(m6.group());
patfilled = m6.group().replace("(22)", " ");
patfilled = patfilled.replace("Filed", " ");
patfilled= patfilled.replace("PCT", " ");
patfilled = patfilled.replace(":", " ");
patfilled = patfilled.replace("\n", "");
patfilled= patfilled.trim();
}
while (m7.find())
{
patdate = m7.group().replace("(45) Date of Patent: ", " ");
patdate = patdate.replace("(45) Date of Patent.: ", " ");
patdate = patdate.replace("(45)", " ");
patdate = patdate.replace("Date", " ");
patdate = patdate.replace("of", " ");
patdate = patdate.replace("Patent.: ", " ");
patdate = patdate.replace("Patent: ", " ");
patdate = patdate.replace("Reissued", " ");
patdate = patdate.replace(":", " ");
patdate = patdate.replace("Patent", " ");
patdate = patdate.replace("*", " ");
patdate = patdate.trim();
}
System.out.println("File name:"+f.getName());
System.out.println(patno +"\n"+patit+"\n"+patdate+"\n"+patfilled+"\n"+appno+"\n-------");
// boolean st = addPatent (patno,patit,patdate,patfilled,appno);
// if ( st == true ) System.out.println(patno+" added");
// else System.out.println(patno+" not added");
count++;
}
System.out.print("-----Finised "+count+" Files------ \n");
}
finally
{
if( document != null )
{
document.close();
}
}
}
}
catch (Exception e)
{
System.out.println(e.getStackTrace());
//System.out.println(e.getLocalizedMessage());
System.out.println(e.getMessage());
System.out.println(e.getCause());
//System.out.println(e.getClass());
e.printStackTrace();
}
}
static boolean addPatent(String pno,String ptitle,String pat_date ,String filed_date , String appl_no )
{
int i=0;
boolean status =false;
try {
Class.forName("com.mysql.jdbc.Driver").newInstance();
Connection con = DriverManager.getConnection("jdbc:mysql://localhost:3306/patent", "root","ragesh");
PreparedStatement st = con.prepareStatement("insert into patents_info values (?,?,?,?,?,?)");
st.setString(1, pno);
st.setString(2, ptitle);
st.setString(3,pat_date);
st.setString(4,filed_date);
st.setString(5,appl_no);
st.setInt(6,0);
i=st.executeUpdate();
if (i > 0) status= true;
}
catch (Exception e)
{
e.printStackTrace();
}
return status;
}
public static List<File> getAllChildFiles(File[] dir)
{
List<File> result = new ArrayList<File>();
for (File file : dir)
{
if (file.isDirectory())
{
File[] children = file.listFiles();
List<File> grandChildren = getAllChildFiles(children);
result.addAll(grandChildren);
}
else
{
result.add(file);
}
}
return result;
}
}
This programs gives output up to some iterations , but halts and thorw exception like above specified ..
Sample output with Exception :
File name:06019327.pdf
Number: 6,019,327
[54] INSTALLATION STRUCTURE OF OUTDOOR
COMMUNICATION DRIVE
[45] Feb. 1, 2000
[22] Aug. 30, 1996
Related U.S. Application Data
[21] 08/704,920
-------
File name:06019328.pdf
Number: 6,019,328
[54] STAY-PUT PEGBOARD ACCESSORY
[45] Feb. 1, 2000
[22] Jan. 27, 1999
[21] 09/238,242
-------
File name:06019329.pdf
Number: 6,019,329
[54] CLAMPS
[45] Feb. 1, 2000
[22] Oct. 30, 1997
[21] 08/961,310
-------
File name:06019330.pdf
Number: 6,019,330
[54] ROOF GUARD DEVICE FOR LIFTING
OBJECTS ON TO A ROOF
[45] Feb. 1, 2000
[22] Nov. 20, 1997
[21] 08/974,866
-------
File name:06019331.pdf
Number: 6,019,331
[54] CANTILEVER BRACKET ASSEMBLY
[45] Feb. 1, 2000
[22] May 28, 1997
Related U.S. Application Data
[21] 08/865,587
-------
[Ljava.lang.StackTraceElement;@43a6684f
Error: Expected a long type, actual='930[299'
java.io.IOException: Error: Expected a long type, actual='930[299'
at org.apache.pdfbox.pdfparser.BaseParser.readLong(BaseParser.java:1669)
at org.apache.pdfbox.pdfparser.PDFObjectStreamParser.parse(PDFObjectStreamParser.java:100)
at org.apache.pdfbox.cos.COSDocument.dereferenceObjectStreams(COSDocument.java:632)
at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:244)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1205)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1172)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1097)
at PatentAdder.main(PatentAdder.java:60)
2nd Problem
Sometimes the execution freezes.. That is it just showing the blinking cursor after some more iterations .... Why... ?
File name:06019329.pdf
Number: 6,019,329
[54] CLAMPS
[45] Feb. 1, 2000
[22] Oct. 30, 1997
[21] 08/961,310
-------
File name:06019330.pdf
Number: 6,019,330
[54] ROOF GUARD DEVICE FOR LIFTING
OBJECTS ON TO A ROOF
[45] Feb. 1, 2000
[22] Nov. 20, 1997
[21] 08/974,866
-------
File name:06019331.pdf
Number: 6,019,331
[54] CANTILEVER BRACKET ASSEMBLY
[45] Feb. 1, 2000
[22] May 28, 1997
Related U.S. Application Data
[21] 08/865,587
-------
(__ cursor blinks on... and execution freezes )
Please help me to resolve this 2 issues:
JDK version : 1.6 PDF Box 1.8.3