I have got some codes from online and they are providing me the font sizes. I did not understand how the TextRenderInfo is reading text. I tried with renderInfo.GetText()) which is giving random number of characters, sometimes 3 characters, sometimes 2 characters or more or less. I need to know how the renderInfo is reading data ?
My intention is to separate every lines and paragraphs from pdf and also read their properties individually such as font size, font style etc. If you have any suggestion, please mention them.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using iTextSharp.text.pdf.parser;
using iTextSharp.text.pdf;
namespace FontSizeDig1
{
class Program
{
static void Main(string[] args)
{
// reader ==> http://itextsupport.com/apidocs/itext5/5.5.9/com/itextpdf/text/pdf/PdfReader.html#pdfVersion
PdfReader reader = new PdfReader(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "document.pdf"));
TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();//strategy==> http://itextsupport.com/apidocs/itext5/5.5.9/com/itextpdf/text/pdf/parser/TextExtractionStrategy.html
// for (int i = 1; i <= reader.NumberOfPages; i++)
// {
string F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1/*i*/, S);
// PdfTextExtractor.GetTextFromPage(reader, 6, S) ==>> http://itextsupport.com/apidocs/itext5/5.5.9/com/itextpdf/text/pdf/parser/PdfTextExtractor.html
Console.WriteLine(F);
// }
Console.ReadKey();
//this.Close();
}
}
public class TextWithFontExtractionStategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
{
//HTML buffer
private StringBuilder result = new StringBuilder();
//Store last used properties
private Vector lastBaseLine;
private string lastFont;
private float lastFontSize;
//http://api.itextpdf.com/itext/com/itextpdf/text/pdf/parser/TextRenderInfo.html
private enum TextRenderMode
{
FillText = 0,
StrokeText = 1,
FillThenStrokeText = 2,
Invisible = 3,
FillTextAndAddToPathForClipping = 4,
StrokeTextAndAddToPathForClipping = 5,
FillThenStrokeTextAndAddToPathForClipping = 6,
AddTextToPaddForClipping = 7
}
public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
{
string curFont = renderInfo.GetFont().PostscriptFontName; // http://itextsupport.com/apidocs/itext5/5.5.9/com/itextpdf/text/pdf/parser/TextRenderInfo.html#getFont--
//Check if faux bold is used
if ((renderInfo.GetTextRenderMode() == 2/*(int)TextRenderMode.FillThenStrokeText*/))
{
curFont += "-Bold";
}
//This code assumes that if the baseline changes then we're on a newline
Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
Vector topRight = renderInfo.GetAscentLine().GetEndPoint();
iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);
Single curFontSize = rect.Height;
//See if something has changed, either the baseline, the font or the font size
if ((this.lastBaseLine == null) || (curBaseline[Vector.I2] != lastBaseLine[Vector.I2]) || (curFontSize != lastFontSize) || (curFont != lastFont))
{
//if we've put down at least one span tag close it
if ((this.lastBaseLine != null))
{
this.result.AppendLine("</span>");
}
//If the baseline has changed then insert a line break
if ((this.lastBaseLine != null) && curBaseline[Vector.I2] != lastBaseLine[Vector.I2])
{
this.result.AppendLine("<br />");
}
//Create an HTML tag with appropriate styles
this.result.AppendFormat("<span style=\"font-family:{0};font-size:{1}\">", curFont, curFontSize);
}
//Append the current text
this.result.Append(renderInfo.GetText());
Console.WriteLine("me=" + renderInfo.GetText());//by imtiaj
//Set currently used properties
this.lastBaseLine = curBaseline;
this.lastFontSize = curFontSize;
this.lastFont = curFont;
}
public string GetResultantText()
{
//If we wrote anything then we'll always have a missing closing tag so close it here
if (result.Length > 0)
{
result.Append("</span>");
}
return result.ToString();
}
//Not needed
public void BeginTextBlock() { }
public void EndTextBlock() { }
public void RenderImage(ImageRenderInfo renderInfo) { }
}
}