Arabic characters in HTML to PDF using iText

2019-04-16 05:26发布

问题:

I've gone through every iText topic related to Arabic characters on Stack Overflow already, but didn't find an answer for this one. I need to convert an HTML file into a PDF, but this HTML contains both English and Arabic characters. When I display the HTML in Notepad++ or in any browser there is no problem and I can see the Arabic characters properly, but when I use the following program to convert it into a PDF, I can't figure out a way to display the Arabic characters; I only get "?" instead:

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils;
import com.itextpdf.text.Document;
import com.itextpdf.text.FontFactory;
import com.itextpdf.text.pdf.BaseFont;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;


/**
 * Minimal reproduction: converts an HTML file containing English and Arabic text to a PDF
 * using iText's {@code XMLWorkerHelper}.
 */
public class Test2 {

    /**
     * Reads {@code C:\Test\test_arabic.html} as UTF-8 and renders it to
     * {@code C:\Test\Test.pdf}. Any failure is printed to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Charset utf8 = Charset.forName("UTF-8");
        // try-with-resources closes both file streams even when parsing throws.
        try (FileInputStream in = new FileInputStream(new File(
                     "C:\\Test\\test_arabic.html"));
             OutputStream file = new FileOutputStream(new File("C:\\Test\\Test.pdf"))) {
            String k = IOUtils.toString(in, utf8);
            Document document = new Document();
            PdfWriter writer = PdfWriter.getInstance(document, file);
            // Encode with the same charset that is announced to the parser below;
            // a bare getBytes() would use the platform default and could corrupt
            // the Arabic text on a JVM whose default charset is not UTF-8.
            InputStream htmlIn = new ByteArrayInputStream(k.getBytes(utf8));
            document.open();
            XMLWorkerHelper helper = XMLWorkerHelper.getInstance();
            FontFactory.getFontImp().registerDirectory("C:\\Windows\\Fonts");
            FontFactory.getFontImp().defaultEncoding = BaseFont.IDENTITY_H;
            // NOTE(review): `in` was already read to its end above, yet it is passed here
            // in the CSS-stream position, so it presumably contributes no styles. Kept
            // as in the original; confirm whether a real stylesheet (or null) is intended.
            helper.parseXHtml(writer, document, htmlIn, in, utf8,
                    FontFactory.getFontImp());
            // Finish the PDF before the underlying output stream is closed.
            document.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Here is my sample html file :

<html>
<head>
  <meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
  <meta name="language" content="ar-SA" />
  <title>My arabic html</title>
</head>

<body>
<font size="1">

<table width="700" style='font-family:Verdana; font-size:20px; color:blue'>
  <tr>
    <td align="left">ADVICE</td>
    <td dir="rtl" lang="ar-SA"><p align='right' style='font-family:Traditional Arabic;'> إشعار </p></td>
  </tr>
</table>

<table width="700" style='font-size:16px; color:white; background-color:gray'>
  <tr>
    <td align="left">Foreign Exchange</td>
    <td dir="rtl" lang="ar-SA"><p align='right' style='font-family:Traditional Arabic;'> تبادل العملات الأجنبية </p></td>
  </tr>
</table>
</font>
</body>
</html>

Does anyone know how to do that? I also tried converting my HTML into a byte array using a W3C document and iTextRender, but with no success.

Edit: I now use the code provided by VahidN (thanks a lot again). A small follow-up, because I'm still struggling with the alignment: it seems that align="left" does not work with Arabic text and runDirection RTL. Here is my sample HTML:

<html>
   <head>
      <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
      <meta name="language" content="ar-SA" />
      <title>Confirmation Notice</title>
   </head>
   <body>
      <font size="1">
         <table width="700" style="font-family:Verdana; font-size:20px; color:white; background-color:blue">
            <tr>
               <td width="350" align="right">ADVICE</td>
               <td width="350" align="left" dir="rtl" lang="ar-SA">
                  <p style="font-family:traditional arabic;">
                     <b>إشعار</b>
                  </p>
               </td>
            </tr>
            <tr>
               <td width="350" align="right">Islamic Return Account</td>
               <td width="350" dir="rtl" lang="ar-SA" align="left">
                  <p style="font-family:traditional arabic;">
                     <b>حساب العائد الإسلامي</b>
                  </p>
               </td>
            </tr>
         </table>
    </font>
    </body>
    </html>

But the Arabic column never aligns to the left. align="center" works, though... Any idea?

Thanks a lot

Thanks for your help

回答1:

I solved this issue using iTextSharp (C# version). Here you can find it: http://www.dotnettips.info/file/userfile?name=XMLWorkerRTLsample.cs

The attached sample needs a small modification as well:

/// <summary>
/// Collects the elements carried by <paramref name="htmlElement"/> into the shared paragraph.
/// Children of a <c>PdfDiv</c> are added individually instead of the div itself; every
/// element is passed through <c>fixNestedTablesRunDirection</c> before it is added.
/// </summary>
/// <param name="htmlElement">Item handed over by the pipeline; ignored unless it is a <c>WritableElement</c>.</param>
public void Add(IWritable htmlElement)
{
    var writable = htmlElement as WritableElement;
    if (writable != null)
    {
        foreach (var item in writable.Elements())
        {
            var container = item as PdfDiv;
            if (container == null)
            {
                // Plain element: fix it up and collect it as-is.
                fixNestedTablesRunDirection(item);
                _paragraph.Add(item);
                continue;
            }

            // Div: unwrap it and collect each child separately.
            foreach (var child in container.Content)
            {
                fixNestedTablesRunDirection(child);
                _paragraph.Add(child);
            }
        }
    }
}


回答2:

Sorry for the delay Samy, here is my code (the htmlString is my example above) :

// Charset announced to the HTML pipeline context below.
Charset CHARSET_UTF8 = Charset.forName("UTF-8");

// In-memory target for the generated PDF bytes.
ByteArrayOutputStream baos = null;

    // NOTE: this excerpt ends before the try block is closed; the matching
    // catch/finally is not shown.
    try {
        baos = new ByteArrayOutputStream();
        Document pdfDoc = new Document();
        PdfWriter writer = PdfWriter.getInstance(pdfDoc, baos);
        writer.setRgbTransparencyBlending(true);
        pdfDoc.open();

        // Build the XMLWorker pipeline: CSS resolution -> HTML processing -> element collection.
        // The last stage hands parsed elements to elementsHandler (next pipeline is null).
        StyleAttrCSSResolver cssResolver = new StyleAttrCSSResolver();
        ElementsCollector elementsHandler = new ElementsCollector();
        // Fonts are resolved through the custom UnicodeFontProvider.
        HtmlPipelineContext htmlContext = new HtmlPipelineContext(new CssAppliersImpl(
                new UnicodeFontProvider()));
        htmlContext.charSet(CHARSET_UTF8);
        htmlContext.setAcceptUnknown(true).autoBookmark(true)
                .setTagFactory(Tags.getHtmlTagProcessorFactory());
        CssResolverPipeline pipeline = new CssResolverPipeline(cssResolver, new HtmlPipeline(htmlContext,
                new ElementHandlerPipeline(elementsHandler, null)));

        // Parse the HTML string; the worker feeds the pipeline as the parser emits events.
        XMLWorker worker = new XMLWorker(pipeline, true);
        XMLParser parser = new XMLParser();
        parser.addListener(worker);
        parser.parse(new StringReader(htmlString));

        // Wrap the collected paragraph in a single-column, full-width, borderless table
        // and add that table to the document.
        PdfPTable mainTable = new PdfPTable(1);
        mainTable.setWidthPercentage(100);
        PdfPCell cell = new PdfPCell();
        cell.setBorder(0);
        cell.setHorizontalAlignment(Element.ALIGN_LEFT);
        cell.addElement(elementsHandler.getParagraph());
        mainTable.addCell(cell);

        pdfDoc.add(mainTable);
        pdfDoc.close();

ElementsCollector:

import java.util.Iterator;
import java.util.List;

import com.itextpdf.text.Chunk;
import com.itextpdf.text.Element;
import com.itextpdf.text.Font;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.pdf.PdfPCell;
import com.itextpdf.text.pdf.PdfPRow;
import com.itextpdf.text.pdf.PdfPTable;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.ElementHandler;
import com.itextpdf.tool.xml.Writable;
import com.itextpdf.tool.xml.html.pdfelement.NoNewLineParagraph;
import com.itextpdf.tool.xml.pipeline.WritableElement;

/**
 * {@link ElementHandler} that gathers every element emitted by the XMLWorker pipeline into a
 * single {@link Paragraph}, switching table cells whose text uses an "arabic" font family to
 * right-to-left run direction on the way.
 */
public class ElementsCollector implements ElementHandler {

    /** Receives every collected element; exposed through {@link #getParagraph()}. */
    private final Paragraph _paragraph;

    /** Creates a collector whose paragraph is left-aligned. */
    public ElementsCollector() {
        _paragraph = new Paragraph();
        _paragraph.setAlignment(Element.ALIGN_LEFT);
    }

    /**
     * @return the paragraph holding all elements collected so far
     */
    public Paragraph getParagraph() {
        return _paragraph;
    }

    /**
     * Adds the elements of {@code htmlElement} to the paragraph. The children of a
     * {@link NoNewLineParagraph} are added one by one instead of the wrapper itself.
     *
     * @param htmlElement item handed over by the pipeline; {@code null} and anything that is
     *                    not a {@link WritableElement} is ignored
     */
    @Override
    public void add(Writable htmlElement) {
        // instanceof also rejects null, and avoids the ClassCastException a blind cast
        // would throw for other Writable implementations.
        if (!(htmlElement instanceof WritableElement)) {
            return;
        }
        for (Element element : ((WritableElement) htmlElement).elements()) {
            if (element instanceof NoNewLineParagraph) {
                Iterator<Element> children = ((NoNewLineParagraph) element).iterator();
                while (children.hasNext()) {
                    Element child = children.next();
                    fixNestedTablesRunDirection(child);
                    _paragraph.add(child);
                }
            } else {
                fixNestedTablesRunDirection(element);
                _paragraph.add(element);
            }
        }
    }

    /**
     * If {@code element} is a table, sets right-to-left run direction on every cell that
     * contains text in an "arabic" font family. Anything else (including {@code null}) is
     * left untouched.
     */
    private void fixNestedTablesRunDirection(Element element) {
        if (!(element instanceof PdfPTable)) {
            return;
        }
        PdfPTable table = (PdfPTable) element;
        for (PdfPRow row : table.getRows()) {
            if (row == null) {
                continue;
            }
            for (PdfPCell cell : row.getCells()) {
                // The cell array can contain null slots (e.g. positions covered by a
                // spanning cell), so guard before dereferencing.
                if (cell == null || cell.getCompositeElements() == null) {
                    continue;
                }
                for (Element item : cell.getCompositeElements()) {
                    if (!usesArabicFont(item)) {
                        continue;
                    }
                    cell.setRunDirection(PdfWriter.RUN_DIRECTION_RTL);
                    // NOTE(review): right-aligned paragraphs are switched to left once the
                    // cell is RTL, presumably because alignment is then interpreted
                    // relative to the run direction; confirm against iText behaviour.
                    if (item instanceof Paragraph
                            && ((Paragraph) item).getAlignment() == Element.ALIGN_RIGHT) {
                        ((Paragraph) item).setAlignment(Element.ALIGN_LEFT);
                    }
                }
            }
        }
    }

    /** Tells whether any chunk of {@code item} uses a font whose family name contains "arabic". */
    private static boolean usesArabicFont(Element item) {
        if (item == null) {
            return false;
        }
        List<Chunk> chunks = item.getChunks();
        if (chunks == null) {
            return false;
        }
        for (Chunk chunk : chunks) {
            Font font = chunk.getFont();
            String family = font != null ? font.getFamilyname() : null;
            // Locale.ROOT keeps the match stable under locales with special casing rules
            // (e.g. Turkish, where 'I' does not lower-case to 'i').
            if (family != null
                    && family.toLowerCase(java.util.Locale.ROOT).contains("arabic")) {
                return true;
            }
        }
        return false;
    }

}

and UnicodeFontProvider :

import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.nio.file.Paths;

import com.itextpdf.text.BaseColor;
import com.itextpdf.text.Font;
import com.itextpdf.text.FontFactory;
import com.itextpdf.text.FontFactoryImp;
import com.itextpdf.text.pdf.BaseFont;


public class UnicodeFontProvider extends FontFactoryImp {

public UnicodeFontProvider() {
    String root = System.getenv("SystemRoot");
    FileSystems.getDefault();
    Path path = Paths.get(root, "fonts");
    FontFactory.getFontImp().registerDirectory(path.toString());
    // TODO test, works on windows so far
}

public Font getFont(String fontname, String encoding, boolean embedded, float size, int style,
        BaseColor color, boolean cached) {
    if (fontname!= null && !fontname.isEmpty()) {
        return new Font(Font.FontFamily.UNDEFINED, size, style, color);
    }
    return FontFactory.getFont(fontname, BaseFont.IDENTITY_H, BaseFont.EMBEDDED, size, style, color);
}

}