Store hindi words in MYSQL using JAVA web crawler

2019-08-28 22:53发布

问题:

I want to store some Hindi words in a MySQL database. For that I wrote a web crawler. I am able to read those words successfully from HTML page and display them in the NetBeans console. But when I insert them in MySQL they change to ???????. Also if I insert the same words using SQL query in PHPMyAdmin itself, they get stored properly.

I have searched Google and various forums a lot and have taken proper precautions in handling Unicode in most places. Do we have to mention explicitly in the SQL statements (JDBC) if entering Unicode?

Here is my whole code.

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.*;

public class TestDataParsing2 {
  public int counter = 1;
  private String ID = "";
  private String title = "";
  private String owner = "";
  private String s="";

  private Connection conn = null;
  private String url = "jdbc:mysql://localhost:3306";
  private String dbName = "/hindi-eng";
  private String driver = "com.mysql.jdbc.Driver";
  private String userName = "root";
  private String password = "";
  private String TABLE = "dict";

private void initdb(){
  try {
    Class.forName(driver).newInstance();
    conn = DriverManager.getConnection(url+dbName,userName,password);
  } catch (Exception e) {
    e.printStackTrace();
  }
}

private void closedb(){
  try {
    conn.close();
  } catch (Exception e) {
    e.printStackTrace();
  }
}
public void process(String content){
  try{
    BufferedReader reader =  new BufferedReader(new StringReader(content));
    String text = "";
    boolean start1 = false;
    boolean start2 = false;
    while (( text = reader.readLine()) != null) {
      if(text.contains("\"a")) {
        System.out.println("______________________________________________________________");
        String id = getID(text);
        this.ID = id;
        String title = getTitle(text);
        this.title = title;
        String owner = getOwner(text);
        this.owner = owner;
        start1 = true;
      }
      if(start1 && start2) {
        String s = getS(text);
        this.s = s;
        counter++;
        insert();
        start2=false;
        start1= false;
      }
      if(start1) {
        start2= true;
      }
    }
  }catch(Exception e){
    System.out.println(e);
  }
}

public void insert(){
  String insertString = "INSERT INTO " + TABLE + " VALUES (" + this.counter + ",'" +
    this.ID + "','" + this.title + "','" + this.owner + "','" + this.s + "')";
  System.out.println(insertString);
  try {
    Statement stmt = conn.createStatement();
    stmt.executeUpdate(insertString);
    stmt.close();
  } catch(Exception e) {
    System.out.println(e);
  }
}

public String getID(String text){
  String id = "";
  id = text.substring(text.indexOf("\"")+1, text.indexOf("\","));
  return id;
}

public String getTitle(String text){
  String title = "";
  title = text.substring(text.indexOf(",\"")+2, text.indexOf("\",\"1."));
  return title;
}

public String getOwner(String text){
  try{
    String owner = "";
    owner = text.substring(text.indexOf("\",\"1.")+5, text.indexOf("\"<br>"));
    int i;
    for(i=0;i<owner.length();i++) {
      String fifthChar = "\u00AE";
      int codePoint = owner.codePointAt(i);
    }
    return owner;
  } catch(Exception e) {
    System.out.println(e);
    System.out.println("eeee");
  }
  return owner;
}

public String getS(String text){
  String s = "";
  s = text.substring(0, text.indexOf("<br>"));
  return s;
}

public String download(String path) {
  String result = "";
  try {
    URL url = new URL(path);
    URLConnection conn = url.openConnection();
    conn.setDoOutput(true);
    InputStream in = null;
    in = url.openStream();
    String content = pipe(in,"utf-8");
    result = content;
  } catch (Exception e) {
    e.printStackTrace();
  }
  return result;
}

public String pipe(InputStream in,String charset) throws IOException {
  StringBuffer s = new StringBuffer();
  if(charset==null||"".equals(charset)){
    charset="utf-8";
  }
  String rLine = null;
  BufferedReader bReader = new BufferedReader(new InputStreamReader(in,"UTF-8"));
  FileOutputStream("C:\\Research\\MiningSoftwareRepositories\\Traceability-Link-Recovery\\EXPERIMENTS\\BR\\"
    + bugid + ".txt");
  while ( (rLine = bReader.readLine()) != null) {
    String tmp_rLine = rLine;
    s.append(tmp_rLine+"\n");
  }
  tmp_rLine = null;
}
  in.close();
  return s.toString();
}

public static void main(String[] args) {
  TestDataParsing2 tdp = new TestDataParsing2();
  tdp.initdb();
  System.out.println("process started");
  String urlPath = "file:///C:/Users/Abhinav/Downloads/Compressed/eng-hindi-dict-utf8/sa.htm";
  String content = tdp.download(urlPath);
  tdp.process(content);
  tdp.closedb();
}

Also when I add "?useUnicode=yes&characterEncoding=UTF-8" to the conn_url I get the following errors which were not coming without it.

java.sql.SQLException: Unsupported character encoding 'UTF-8/hindi-eng'. process started at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:1074) at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:988) at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:974) at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:919) at com.mysql.jdbc.StringUtils.getBytes(StringUtils.java:574) at com.mysql.jdbc.StringUtils.getBytes(StringUtils.java:719) at com.mysql.jdbc.Buffer.writeStringNoNull(Buffer.java:704) at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2573) at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2713) at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2663) at com.mysql.jdbc.StatementImpl.executeQuery(StatementImpl.java:1599) at com.mysql.jdbc.ConnectionImpl.loadServerVariables(ConnectionImpl.java:3928) at com.mysql.jdbc.ConnectionImpl.initializePropsFromServer(ConnectionImpl.java:3473) at com.mysql.jdbc.ConnectionImpl.connectOneTryOnly(ConnectionImpl.java:2445) at com.mysql.jdbc.ConnectionImpl.createNewIO(ConnectionImpl.java:2215) at com.mysql.jdbc.ConnectionImpl.(ConnectionImpl.java:813) at com.mysql.jdbc.JDBC4Connection.(JDBC4Connection.java:47) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:525) at com.mysql.jdbc.Util.handleNewInstance(Util.java:411) at com.mysql.jdbc.ConnectionImpl.getInstance(ConnectionImpl.java:399) at com.mysql.jdbc.NonRegisteringDriver.connect(NonRegisteringDriver.java:334) at java.sql.DriverManager.getConnection(DriverManager.java:579) at java.sql.DriverManager.getConnection(DriverManager.java:221) at TestDataParsing2.initdb(TestDataParsing2.java:29) at TestDataParsing2.main(TestDataParsing2.java:239)


INSERT INTO dict VALUES (2,'a','Art','एक','I bought a pen.') java.lang.NullPointerException __________________________________ INSERT INTO dict VALUES (3,'aback','Adv','पीछे/हतप्रभ','I was somewhat taken aback by his rudeness.') java.lang.NullPointerException

回答1:

You don't specify the database connection encoding, so the server default encoding gets used. It looks like the server is not configured to use UTF-8.

You can either set the server encoding to UTF-8, or set the characterEncoding property on the connection:

conn = DriverManager.getConnection(url+dbName+"?characterEncoding=UTF-8",userName,password);

Remember that the syntax of the connection url is "jdbc:mysql://host:port/database?option1=value1&option2=value2&...".