Java file encoding conversion

2019-05-07 04:40发布

I have a requirement to change the encoding of a file from ANSI (windows-1252) to UTF-8. I wrote the program below to do it in Java. The program converts the characters to UTF-8, but when I open the file in Notepad++ the encoding is displayed as "ANSI as UTF-8". This gives me an error when I import the file into an Access database. A file that is recognized as plain UTF-8 is desired. Also, the requirement is to convert the file without opening it in any editor.

/**
 * Converts every regular file in a folder from windows-1252 ("ANSI") to UTF-8.
 *
 * <p>Usage: {@code java ConvertFromAnsiToUtf8 <inputFolder>}. The converted copies are written
 * to a new sibling folder named {@code <inputFolder>_converted}, and each copy starts with a
 * UTF-8 byte order mark. Nothing is converted if the input is not a directory or if the output
 * folder already exists.
 */
public class ConvertFromAnsiToUtf8 {

    /** U+FEFF; written as the first character it becomes the UTF-8 byte order mark. */
    private static final char BYTE_ORDER_MARK = '\uFEFF';
    private static final String ANSI_CODE = "windows-1252";
    private static final String UTF_CODE = "UTF-8";
    private static final Charset ANSI_CHARSET = Charset.forName(ANSI_CODE);
    private static final Charset UTF_CHARSET = Charset.forName(UTF_CODE);
    private static final int BUFFER_SIZE = 8192;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the folder whose files are converted
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: ConvertFromAnsiToUtf8 <inputFolder>");
            return;
        }

        // Work on the absolute path so that a bare relative name still has a parent folder.
        File inputFolder = new File(args[0]).getAbsoluteFile();
        if (!inputFolder.isDirectory()) {
            return;
        }
        File parentDir = inputFolder.getParentFile();
        if (parentDir == null) {
            System.err.println("Cannot create a sibling folder next to " + inputFolder);
            return;
        }

        // File(parent, child) inserts the platform's separator instead of a hard-coded one.
        File outputFolder = new File(parentDir, inputFolder.getName() + "_converted");
        if (outputFolder.exists() || !outputFolder.mkdir()) {
            return;
        }

        // listFiles() returns null (not an empty array) when the folder cannot be read.
        File[] entries = inputFolder.listFiles();
        if (entries == null) {
            System.err.println("Cannot list " + inputFolder);
            return;
        }

        for (File entry : entries) {
            if (!entry.isFile()) {
                continue; // sub-directories cannot be opened as streams
            }
            try {
                convert(entry, new File(outputFolder, entry.getName()));
            } catch (IOException e) {
                System.err.println("Failed to convert " + entry + ": " + e);
            }
        }
    }

    /**
     * Copies {@code source} to {@code target}, decoding windows-1252 and encoding UTF-8 with a
     * leading byte order mark. Both streams are closed before returning, so the output is
     * completely flushed for every file.
     *
     * @throws IOException if either file cannot be opened, read or written
     */
    private static void convert(File source, File target) throws IOException {
        try (Reader reader = new InputStreamReader(new FileInputStream(source), ANSI_CHARSET);
                Writer writer = new OutputStreamWriter(new FileOutputStream(target), UTF_CHARSET)) {
            writer.write(BYTE_ORDER_MARK);
            char[] buffer = new char[BUFFER_SIZE];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                writer.write(buffer, 0, read);
            }
        }
    }

}

Any pointers will be helpful.

Thanks, Ashish

2条回答
在下西门庆
2楼-- · 2019-05-07 05:23

The posted code correctly transcodes from windows-1252 to UTF-8.

The Notepad++ message is confusing because "ANSI as UTF-8" has no obvious meaning; it appears to be an open defect in Notepad++. I believe Notepad++ means UTF-8 without BOM (see the encoding menu.)

Microsoft Access, being a Windows program, probably expects UTF-8 files to start with a byte-order-mark (BOM).

You can inject a BOM into the document by writing the code point U+FEFF at the start of the file:

import java.io.*;
import java.nio.charset.*;

/**
 * Command-line tool that re-encodes one windows-1252 file as UTF-8 with a leading BOM.
 *
 * <p>Usage: {@code java Ansi1252ToUtf8 <sourceFile> <targetFile>}.
 */
public class Ansi1252ToUtf8 {
  /** U+FEFF; emitted first so that the output starts with the UTF-8 byte order mark. */
  private static final char BOM = (char) 0xFEFF;

  /**
   * Reads {@code args[0]} as windows-1252 and writes it to {@code args[1]} as UTF-8.
   *
   * @throws IOException if either file cannot be opened, read or written
   */
  public static void main(String[] args) throws IOException {
    final Charset sourceCharset = Charset.forName("windows-1252");
    final Charset targetCharset = StandardCharsets.UTF_8;
    try (InputStream rawIn = new FileInputStream(args[0]);
        Reader source = new InputStreamReader(rawIn, sourceCharset);
        OutputStream rawOut = new FileOutputStream(args[1]);
        Writer target = new OutputStreamWriter(rawOut, targetCharset)) {
      target.write(BOM);
      final char[] chunk = new char[1024];
      // Pump decoded characters across until the reader signals end of stream.
      for (int count = source.read(chunk); count != -1; count = source.read(chunk)) {
        target.write(chunk, 0, count);
      }
    }
  }
}
查看更多
唯我独甜
3楼-- · 2019-05-07 05:27

On Windows 7 (64-bit) running Java 8, I had to close every file; otherwise the output files were truncated to multiples of 4 kB. It is not enough to close only the last reader and writer; each file must be closed to get the desired result. Here is my adapted version, which also adds error messages:

import java.io.*;
import java.nio.charset.*;
import java.util.ArrayList;

/**
 * Converts every regular file in a source folder from windows-1252 ("ANSI") to UTF-8.
 *
 * <p>Usage: {@code java ConvertFromAnsiToUtf8 <sourceFolder> <targetFolder>}. The target folder
 * must not exist yet; it is created and receives one converted copy per source file. A byte
 * order mark is written only when {@link #WRITE_BOM} is {@code true}.
 */
public class ConvertFromAnsiToUtf8 {

    /** U+FEFF; written as the first character it becomes the UTF-8 byte order mark. */
    private static final char BYTE_ORDER_MARK = '\uFEFF';
    private static final String ANSI_CODE = "windows-1252";
    private static final Charset ANSI_CHARSET = Charset.forName(ANSI_CODE);
    private static final boolean WRITE_BOM = false;
    private static final int BUFFER_SIZE = 1024;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the source folder, {@code args[1]} the target folder
     */
    public static void main(String[] args)
    {
        if (args.length != 2) {
            System.out.println("Please name a source and a target directory");
            return;
        }

        File inputFolder = new File(args[0]);
        if (!inputFolder.isDirectory()) {
            System.out.println("Input folder " + inputFolder + " does not exist");
            return;
        }
        File outputFolder = new File(args[1]);

        if (outputFolder.exists()) {
            System.out.println("Folder " + outputFolder + " exists - aborting");
            return;
        }
        if (outputFolder.mkdir()) {
            System.out.println("Placing converted files in " + outputFolder);
        } else {
            // Existence was ruled out above, so this is a genuine creation failure.
            System.out.println("Output folder " + outputFolder + " could not be created - aborting");
            return;
        }

        // listFiles() returns null (not an empty array) when the folder cannot be read.
        File[] entries = inputFolder.listFiles();
        if (entries == null) {
            System.out.println("Input folder " + inputFolder + " could not be listed - aborting");
            return;
        }

        int converted = 0;
        for (File file : entries) {
            if (!file.isFile()) {
                continue; // sub-directories cannot be opened as streams
            }
            try {
                // File(parent, child) inserts the platform's separator.
                convertFile(file, new File(outputFolder, file.getName()));
                ++converted;
            } catch (IOException e) {
                // Report and carry on so one bad file does not block the rest.
                System.out.println("Could not convert " + file + ": " + e);
            }
        }

        System.out.println(converted + " files converted");
    }

    /**
     * Copies {@code source} to {@code target}, decoding windows-1252 and encoding UTF-8. Both
     * streams are closed before returning, even on failure, so each output file is completely
     * flushed.
     *
     * @throws IOException if either file cannot be opened, read or written
     */
    private static void convertFile(File source, File target) throws IOException {
        try (Reader reader = new InputStreamReader(new FileInputStream(source), ANSI_CHARSET);
                Writer writer = new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8)) {
            if (WRITE_BOM) {
                writer.write(BYTE_ORDER_MARK);
            }
            char[] buffer = new char[BUFFER_SIZE];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                writer.write(buffer, 0, read);
            }
        }
    }

}
查看更多
登录 后发表回答