I am using PDFBox version 2.0.9
in my application. I have to parse large PDF files fetched from the web. The following is the code I am using.
MimeTypeDetector class
/**
 * Holder for the Tika objects used to detect and parse one fetched document.
 *
 * <p>The constructor keeps the caller's {@link ByteArrayInputStream} and also
 * wraps it in a {@link TikaInputStream} behind a {@link CloseShieldInputStream},
 * so both fields read from the same underlying byte stream. Accessors are
 * generated by Lombok.
 */
@Getter
@Setter
class MimeTypeDetector {

    private ByteArrayInputStream byteArrayInputStream;
    private BodyContentHandler bodyContentHandler;
    private Metadata metadata;
    private ParseContext parseContext;
    private Detector detector;
    private TikaInputStream tikaInputStream;

    /**
     * Creates the handler, metadata, parse context and detector for one document.
     *
     * @param byteArrayInputStream in-memory content of the document
     */
    MimeTypeDetector(ByteArrayInputStream byteArrayInputStream) {
        this.byteArrayInputStream = byteArrayInputStream;

        // Closing the TikaInputStream must not close the shared byte stream,
        // hence the close shield between the two.
        CloseShieldInputStream shielded = new CloseShieldInputStream(byteArrayInputStream);
        this.tikaInputStream = TikaInputStream.get(shielded);

        this.detector = new DefaultDetector();
        this.parseContext = new ParseContext();
        this.metadata = new Metadata();
        // NOTE(review): per the Tika docs a write limit of -1 disables the limit,
        // so all extracted text is buffered in this handler - confirm this is intended.
        this.bodyContentHandler = new BodyContentHandler(-1);
    }
}
/**
 * Fetches the document behind {@code url}, detects its content type and
 * dispatches it to the matching crawler.
 *
 * <p>PDF content goes to {@code crawlPDFContent}. Anything that is neither
 * PDF nor a web page is logged as unsupported and removed from
 * {@code linksVisited}. Any {@link IOException} raised while decoding,
 * fetching or detecting is logged and not rethrown.
 *
 * @param url         the URL to crawl
 * @param domainGroup the domain group passed through to the PDF crawler
 */
private void crawlAndSave(String url, DomainGroup domainGroup) {
    MimeTypeDetector mimeTypeDetector = null;
    try {
        String decodeUrl = URLDecoder.decode(url, WebCrawlerConstants.UTF_8);
        byte[] content = HTMLFetcher.fetch(WebCrawlerUtil.encodeUrl(url));
        mimeTypeDetector = new MimeTypeDetector(new ByteArrayInputStream(content));
        String contentType = getContentType(mimeTypeDetector);
        if (isPDF(contentType)) {
            crawlPDFContent(decodeUrl, mimeTypeDetector, domainGroup);
        } else if (isWebPage(contentType)) {
            // fetching HTML web Page Content
        } else {
            log.warn("Skipping URL::" + url + ".Not a supported crawler format");
            linksVisited.remove(url);
        }
    } catch (IOException e) {
        // Pass the exception itself so the type and stack trace are not lost.
        // This handler also catches fetch and detection failures, not only
        // URL decoding errors.
        log.error("crawlAndSave:: Error occurred while decoding URL:" + url + " : " + e.getMessage(), e);
        // some catch operation
    } finally {
        if (mimeTypeDetector != null) {
            // ByteArrayInputStream.close() has no effect; kept for symmetry with
            // other stream types.
            IOUtils.closeQuietly(mimeTypeDetector.getByteArrayInputStream());
        }
    }
}
/**
 * Runs the detector held by {@code mimeTypeDetector} over its Tika stream and
 * returns the detected type as a string.
 *
 * <p>The Tika stream is always closed, including when detection fails.
 *
 * @param mimeTypeDetector holder providing the stream, detector and metadata
 * @return the string form of the detected media type
 * @throws IOException if detection or closing the stream fails
 */
private String getContentType(MimeTypeDetector mimeTypeDetector) throws IOException {
    // try-with-resources closes the stream even if detect() throws.
    try (TikaInputStream tikaInputStream = mimeTypeDetector.getTikaInputStream()) {
        return mimeTypeDetector.getDetector()
                .detect(tikaInputStream, mimeTypeDetector.getMetadata())
                .toString();
    }
}
private void crawlPDFContent(String url, MimeTypeDetector mimeTypeDetector, DomainGroup domainGroup) {
try {
private PDFParser pdfParser = new PDFParser();
pdfParser.parse(mimeTypeDetector.getByteArrayInputStream(), mimeTypeDetector.getBodyContentHandler(),
mimeTypeDetector.getMetadata(), mimeTypeDetector.getParseContext());
// Some Database operation
} catch (IOException | TikaException | SAXException e) {
//Some Catch operation
log.error("crawlPDFContent:: Error in crawling PDF Content" + " : " + e.getMessage());
}
}
HTMLFetcher class
/**
 * Static helper that downloads the raw bytes behind a URL.
 *
 * <p><strong>Security note:</strong> every call to {@link #fetch(URL)} installs
 * a trust-all {@link X509TrustManager} and an accept-all {@link HostnameVerifier}
 * as the {@link HttpsURLConnection} defaults, and may replace the default
 * {@link Authenticator}. These are process-wide settings: TLS certificate and
 * host name validation are switched off for all {@code HttpsURLConnection}s
 * that rely on the defaults.
 */
public class HTMLFetcher {

    private HTMLFetcher() {
    }

    /**
     * Fetches the document at the given URL, using {@link URLConnection}.
     *
     * <p>The response stream is closed and the connection disconnected even
     * when reading the body fails.
     *
     * @param url the URL to download
     * @return the complete response body
     * @throws IOException if opening the connection or reading the body fails
     */
    public static byte[] fetch(final URL url) throws IOException {
        TrustManager[] trustAllCerts = new TrustManager[]{new X509TrustManager() {
            @Override
            public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                // The interface contract requires a non-null (possibly empty) array.
                return new java.security.cert.X509Certificate[0];
            }

            @Override
            public void checkClientTrusted(X509Certificate[] certs, String authType) {
                // Intentionally accepts every client certificate.
            }

            @Override
            public void checkServerTrusted(X509Certificate[] certs, String authType) {
                // Intentionally accepts every server certificate.
            }
        }};
        // NOTE(review): a new SSLContext and socket factory are created and installed
        // on every call; consider building them once if fetch() is called frequently.
        try {
            SSLContext sc = SSLContext.getInstance("SSL");
            sc.init(null, trustAllCerts, new java.security.SecureRandom());
            HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
        } catch (NoSuchAlgorithmException | KeyManagementException e) {
            // Best effort: on failure the existing default socket factory stays in place.
            // NOTE(review): no logger is visible in this class; route this through one.
            e.printStackTrace();
        }
        // Create all-trusting host name verifier
        HostnameVerifier allHostsValid = (hostname, session) -> true;
        HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);
        setAuthentication(url);
        //Taken from Boilerpipe
        final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try (InputStream in = conn.getInputStream()) {
            return IOUtils.toByteArray(in);
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Installs a default {@link Authenticator} answering with the credentials
     * that {@code WebCrawlerUtil} derives from the URL, if there are any.
     *
     * @param url the URL whose credentials should be used
     */
    private static void setAuthentication(URL url) {
        AuthenticationDTO authenticationDTO = WebCrawlerUtil.getAuthenticationFromUrl(url);
        if (Objects.nonNull(authenticationDTO)) {
            Authenticator.setDefault(new Authenticator() {
                @Override
                protected PasswordAuthentication getPasswordAuthentication() {
                    return new PasswordAuthentication(authenticationDTO.getUserName(),
                            authenticationDTO.getPassword().toCharArray());
                }
            });
        }
    }
}
But when I check the memory statistics, the memory usage increases constantly. I verified this using VisualVM
and the YourKit Java profiler.
See the attached image.
Is there anything I am doing wrong? I searched for similar issues like this and this, but it was mentioned that this issue has been fixed in the latest versions.
Please use the following when loading the document: MemoryUsageSetting.setupTempFileOnly()