Splitting database column into multivalued Solr fi

2019-08-23 11:55发布

问题:

I'm going nuts trying to figure out how to get the Data Import Handler's splitBy construct to work. I was expecting it to split the input column into a multivalued field. Here's a test case to reproduce the problem:

import java.io.File;
import java.io.IOException;
import java.sql.SQLException;

import static org.junit.Assert.*;

import javax.sql.DataSource;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.core.CoreContainer;
import org.hsqldb.jdbc.JDBCDataSource;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

public class TestSplitBy {

  SolrServer server;
  File configPath = new File(FileUtils.getTempDirectory(), Long.toString(System.nanoTime()));

  String solrconfig_xml = "<config><luceneMatchVersion>LUCENE_41</luceneMatchVersion><requestHandler name=\"search\" class=\"solr.SearchHandler\" default=\"true\"><lst name=\"defaults\"><str name=\"fl\">*</str><str name=\"df\">id</str></lst></requestHandler><requestHandler name=\"/dataimport\" class=\"org.apache.solr.handler.dataimport.DataImportHandler\"><lst name=\"defaults\"><str name=\"config\">data-config.xml</str></lst></requestHandler></config>";

  String data_config_xml = "<dataConfig>" +
      "<dataSource url=\"jdbc:hsqldb:mem:testdb\" user=\"SA\" driver=\"org.hsqldb.jdbc.JDBCDriver\" />" +
      "<document>" +
      "<entity name=\"item\" transformer=\"RegexTransformer\" query=\"SELECT * FROM test\">" +
      "<field column=\"type\" name=\"type\" splitBy=\",\" />" +
      "</entity>" +
      "</document>" +
      "</dataConfig>";

  String schema_xml = "<schema version=\"1.3\" name=\"test\">" +
      "<types>" +
      "<fieldType name=\"string\" class=\"solr.StrField\" sortMissingLast=\"true\" omitNorms=\"true\" />" +
      "</types>" +
      "<fields>" +
      "<field stored=\"true\" name=\"id\" type=\"string\" />" +
      "<field stored=\"true\" name=\"type\" type=\"string\" multiValued=\"true\"/>" +
      "</fields>" +
      "<uniqueKey>id</uniqueKey>" +
      "</schema>";

  DataSource getDataSource() {
    JDBCDataSource ds = new JDBCDataSource();
    ds.setUser("SA");
    ds.setUrl("mem:testdb");
    return ds;
  }

  void populateDb(DataSource ds) {
    QueryRunner runner = new QueryRunner(ds);
    try {
      runner.update("DROP TABLE test IF EXISTS");
      runner.update("CREATE TABLE test(id INTEGER, type VARCHAR(256));");
      runner.update("INSERT INTO test VALUES 1, 'foo,bar,baz'");
    } catch (SQLException e) {
      System.err.println(e);
    }
  }

  void writeSolrConfig() throws IOException {
    File corePath = new File(configPath, "collection1");
    corePath.mkdir();
    File confPath = new File(corePath, "conf");
    confPath.mkdir();
    FileUtils.write(new File(confPath, "data-config.xml"), data_config_xml);
    FileUtils.write(new File(confPath, "schema.xml"), schema_xml);
    FileUtils.write(new File(confPath, "solrconfig.xml"), solrconfig_xml);
  }

  void startSolr() {
    System.setProperty("solr.solr.home", configPath.getAbsolutePath());
    CoreContainer.Initializer initializer = new CoreContainer.Initializer();
    CoreContainer coreContainer = initializer.initialize();
    server = new EmbeddedSolrServer(coreContainer, "collection1");
  }

  @Before
  public void setup() throws IOException {
    populateDb(getDataSource());
    writeSolrConfig();
    startSolr();
  }

  @After
  public void tearDown() {
    server.shutdown();
    FileUtils.deleteQuietly(configPath);
  }

  @Test
  public void testSplitBy() throws Exception {
    SolrQuery query = new SolrQuery();
    query.set("qt", "/dataimport");
    query.setParam("command", "full-import");
    QueryResponse response = server.query(query);
    Thread.sleep(500);

    response = server.query(new SolrQuery("*:*"));
    for (SolrDocument doc: response.getResults()) {
      assertNotNull(doc.getFieldValues("type"));
      assertEquals(3, doc.getFieldValues("type").size());
    }
  }
}

And the POM for the test case:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.example</groupId>
  <artifactId>solr</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Solr Sanity</name>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr</artifactId>
      <version>4.1.0</version>
      <type>war</type>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-dataimporthandler</artifactId>
      <version>4.1.0</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-solrj</artifactId>
      <version>4.1.0</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>commons-dbutils</groupId>
      <artifactId>commons-dbutils</artifactId>
      <version>1.5</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>org.hsqldb</groupId>
      <artifactId>hsqldb</artifactId>
      <version>2.2.9</version>
      <type>jar</type>
      <scope>runtime</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
    </dependency>
    <dependency>
      <groupId>javax.servlet</groupId>
      <artifactId>servlet-api</artifactId>
      <version>2.5</version>
    </dependency>
  </dependencies>
</project>

Any insight on how to get those types to split correctly into multiple values?

回答1:

Turns out there were a couple of issues with this unit test:

  1. HSQL's column names are case sensitive (and default to upper case).

  2. If the Solr field name and the db column name are identical an extra token with the entire db value is also added.

  3. The field definition should look like:

<field column="solrField" splitBy="," sourceColName="TYPE" />

And in general - when using the RegexTransformer to mix single valued fields from a DB with multivalued fields:

  • If using splitBy then the column attribute is the name of the Solr field. The sourceColName is the database column

  • If not using splitBy then the column attribute is the database column name and the name attribute is the Solr field.



标签: solr dih