Custom “tab” Tokenizer in ElasticSearch NEST 2.4

2019-09-19 02:01发布

问题:

I have an index with many fields, and one field "ServiceCategories" has data similar to this:

|Case Management|Developmental Disabilities

I need to break up the data by the separator "|" and I have attempted to do so with this:

    // First attempt: create the index with an explicit mapping for ProviderContent and a
    // custom analyzer ("tab_delim_analyzer") intended to split ServiceCategories on "|".
    var descriptor = new CreateIndexDescriptor(_DataSource.ToLower())
        .Mappings(ms => ms
            .Map<ProviderContent>(m => m
                .AutoMap()
                .Properties(p => p
                    .String(s => s
                        .Name(n => n.OrganizationName)
                        .Fields(f => f
                            // "raw" sub-field is declared as not analyzed.
                            .String(ss => ss.Name("raw").NotAnalyzed())))
                    .String(s => s
                        .Name(n => n.ServiceCategories)
                        // Must match the analyzer name registered under Settings below.
                        .Analyzer("tab_delim_analyzer"))
                    .GeoPoint(g => g.Name(n => n.Location).LatLon(true)))))
        .Settings(st => st
            .Analysis(an => an
                .Analyzers(anz => anz
                    .Custom("tab_delim_analyzer", td => td
                        .Filters("lowercase")
                    .Tokenizer("tab_delim_tokenizer")))
                .Tokenizers(t => t
                    .Pattern("tab_delim_tokenizer", tdt => tdt
                        // NOTE(review): the tokenizer pattern is a regular expression and "|" is
                        // passed unescaped here; the Analyze output further down this page shows
                        // this analyzer emitting one token per character rather than splitting
                        // on the pipe.
                        .Pattern("|")))));
    _elasticClientWrapper.CreateIndex(descriptor);

My search code for ServiceCategories (serviceCategories to ES) uses a simple TermQuery with the value set to lower case.

It's not getting results using this search parameter (the others work fine). Expected results are to get exact matches on at least one term from the above.

I have attempted to get it working by using a classic tokenizer as well:

    // Second attempt: same mapping, but ServiceCategories is indexed with a custom analyzer
    // built on the "classic" tokenizer and searched with the "standard" analyzer.
    var descriptor = new CreateIndexDescriptor(_DataSource.ToLower())
        .Mappings(ms => ms
            .Map<ProviderContent>(m => m
                .AutoMap()
                .Properties(p => p
                    .String(s => s
                        .Name(n => n.OrganizationName)
                        .Fields(f => f
                            // "raw" sub-field is declared as not analyzed.
                            .String(ss => ss.Name("raw").NotAnalyzed())))
                    .String(s => s
                        .Name(n => n.ServiceCategories)
                        // Index-time and search-time analyzers are configured separately here.
                        .Analyzer("classic_tokenizer")
                        .SearchAnalyzer("standard"))
                    .GeoPoint(g => g.Name(n => n.Location).LatLon(true)))))
        .Settings(s => s
            .Analysis(an => an
                // Custom analyzer with only a tokenizer configured; no token filters
                // (e.g. lowercase) are added in this snippet.
                // NOTE(review): presumably the classic tokenizer splits multi-word categories
                // into separate word tokens — confirm with the Analyze API.
                .Analyzers(a => a.Custom("classic_tokenizer", ca => ca
                    .Tokenizer("classic")))));

This isn't working either. Can anyone help me identify where I am going wrong?

Here's the search request:

### ES REQUEST ###
{
  "from": 0,
  "size": 10,
  "sort": [
    {
      "organizationName": {
        "order": "asc"
      }
    }
  ],
  "query": {
    "bool": {
      "must": [
        {
          "match_all": {}
        },
        {
          "term": {
            "serviceCategories": {
              "value": "developmental disabilities"
            }
          }
        }
      ]
    }
  }
}

回答1:

Your pattern for tab_delim_tokenizer is close, but not quite correct :) The easiest way to see this is to use the Analyze API to understand how an Analyzer will tokenize a piece of text. With your first mapping in place, we can check what the custom analyzer does:

// Run the sample text through the custom analyzer on the target index
// to inspect the tokens it produces.
var analyzeResponse = client.Analyze(analyze => analyze
    .Index(_DataSource.ToLower())
    .Analyzer("tab_delim_analyzer")
    .Text("|Case Management|Developmental Disabilities"));

which returns (snipped for brevity)

{
  "tokens" : [ {
    "token" : "|",
    "start_offset" : 0,
    "end_offset" : 1,
    "type" : "word",
    "position" : 0
  }, {
    "token" : "c",
    "start_offset" : 1,
    "end_offset" : 2,
    "type" : "word",
    "position" : 1
  }, {
    "token" : "a",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "word",
    "position" : 2
  }, {
    "token" : "s",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "word",
    "position" : 3
  }, ... ]
}

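demonstrating that the tab_delim_tokenizer is not tokenizing how we expect. The tokenizer's pattern is a regular expression, and an unescaped | is the alternation operator, so the pattern "|" matches the empty string between every character instead of matching a literal pipe. A small change fixes this: escape the | with \ and make the pattern a verbatim string literal by prefixing it with @, i.e. @"\|".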

Here's a complete example

/// <summary>
/// End-to-end example: recreates the index with an analyzer that splits on the
/// pipe character, verifies the analyzer, indexes one sample document and then
/// searches for it with a term query.
/// </summary>
void Main()
{
    const string indexName = "default-index";
    const string analyzerName = "tab_delim_analyzer";
    const string tokenizerName = "tab_delim_tokenizer";
    const string sampleCategories = "|Case Management|Developmental Disabilities";

    var nodeUri = new Uri("http://localhost:9200");
    var settings = new ConnectionSettings(new SingleNodeConnectionPool(nodeUri))
        .DefaultIndex(indexName);
    var client = new ElasticClient(settings);

    // Start from a clean slate so the mapping and analysis settings below always apply.
    var existsResponse = client.IndexExists(indexName);
    if (existsResponse.Exists)
    {
        client.DeleteIndex(indexName);
    }

    var createIndex = new CreateIndexDescriptor(indexName)
        .Mappings(mappings => mappings
            .Map<ProviderContent>(map => map
                .AutoMap()
                .Properties(props => props
                    // Organization name, with a not-analyzed "raw" sub-field.
                    .String(org => org
                        .Name(doc => doc.OrganizationName)
                        .Fields(sub => sub
                            .String(raw => raw.Name("raw").NotAnalyzed())))
                    // Service categories are analyzed with the custom analyzer defined below.
                    .String(categories => categories
                        .Name(doc => doc.ServiceCategories)
                        .Analyzer(analyzerName))
                    .GeoPoint(geo => geo
                        .Name(doc => doc.Location)
                        .LatLon(true)))))
        .Settings(indexSettings => indexSettings
            .Analysis(analysis => analysis
                .Analyzers(analyzers => analyzers
                    .Custom(analyzerName, custom => custom
                        .Filters("lowercase")
                        .Tokenizer(tokenizerName)))
                .Tokenizers(tokenizers => tokenizers
                    // The pipe is escaped so it is treated as a literal character.
                    .Pattern(tokenizerName, pattern => pattern
                        .Pattern(@"\|")))));

    client.CreateIndex(createIndex);

    // Sanity check: see how the custom analyzer tokenizes the sample text.
    client.Analyze(analyze => analyze
        .Index(indexName)
        .Analyzer(analyzerName)
        .Text(sampleCategories));

    // Index one document and refresh so it is searchable right away.
    var sampleDocument = new ProviderContent
    {
        OrganizationName = "Elastic",
        ServiceCategories = sampleCategories
    };
    client.Index(sampleDocument, index => index.Refresh());

    // Look the document up with a term query; the unary + places it in a
    // bool filter clause because scoring is (probably) not needed.
    client.Search<ProviderContent>(search => search
        .From(0)
        .Size(10)
        .Sort(sort => sort.Ascending(doc => doc.OrganizationName))
        .Query(query => +query
            .Term(doc => doc.ServiceCategories, "developmental disabilities")));
}

/// <summary>
/// Document type indexed and searched in the example above.
/// </summary>
public class ProviderContent
{
    /// <summary>
    /// Name of the organization; the search example sorts on this field.
    /// </summary>
    public string OrganizationName { get; set; }

    /// <summary>
    /// Service categories as a single pipe-delimited string,
    /// e.g. "|Case Management|Developmental Disabilities".
    /// </summary>
    public string ServiceCategories { get; set; }

    /// <summary>
    /// Location of the provider; mapped as a geo point in the index definition.
    /// </summary>
    public GeoLocation Location { get; set; }
}

the search results return

{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1,
    "max_score" : null,
    "hits" : [ {
      "_index" : "default-index",
      "_type" : "providercontent",
      "_id" : "AVqNNqlQpAW_5iHrnIDQ",
      "_score" : null,
      "_source" : {
        "organizationName" : "Elastic",
        "serviceCategories" : "|Case Management|Developmental Disabilities"
      },
      "sort" : [ "elastic" ]
    } ]
  }
}