Trying to get Java to read a Lucene index created with Solr

2019-05-26 18:54发布

问题:

I've got a lucene index that I created with solr. The lucene version is 3.6.1.

I found a java program on the web that reads a lucene index:

http://www.javacodegeeks.com/2010/05/introduction-to-apache-lucene-for-full.html

I modified the program for my local environment but it always tells me that no hits are found for a query which has results in the index. After having no luck with the program I modified the code to use StandardAnalyzer instead of SimpleAnalyzer. No luck.

Here's the code:

package com.javacodegeeks.lucene;

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Searches the "title" field of a Solr-built Lucene index using
 * StandardAnalyzer and prints the stored "filename" field of each hit.
 *
 * <p>NOTE(review): the index was built by Solr with a custom analysis chain
 * (WhitespaceTokenizer + WordDelimiterFilter + LowerCase + SnowballPorter),
 * so StandardAnalyzer at query time may not match the indexed terms — that
 * is the likely cause of the "no hits" symptom described above.
 */
public class StandardSearcher {

    public static void main(String[] args) throws Exception {
        // Hard-coded index location, query, and hit cap; adjust for your environment.
        File indexDir = new File("/path/to/solr/data/index/");
        String query = "science";
        int hits = 100;

        StandardSearcher searcher = new StandardSearcher();
        searcher.searchIndex(indexDir, query, hits);
    }

    /**
     * Runs {@code queryStr} against the "title" field of the index in
     * {@code indexDir}, printing up to {@code maxHits} results.
     *
     * @param indexDir directory containing the Lucene index files
     * @param queryStr query string parsed with Lucene query syntax
     * @param maxHits  maximum number of documents to retrieve
     * @throws Exception on I/O or query-parse failure
     */
    private void searchIndex(File indexDir, String queryStr, int maxHits)
        throws Exception {

        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

        Directory directory = FSDirectory.open(indexDir);
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            Query query = new QueryParser(Version.LUCENE_36, "title", analyzer).parse(queryStr);

            TopDocs topDocs = searcher.search(query, maxHits);

            ScoreDoc[] hits = topDocs.scoreDocs;
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println(d.get("filename"));
            }

            System.out.println("Found " + hits.length);
        } finally {
            // FIX: the original leaked both handles; close searcher before
            // the directory it reads from.
            searcher.close();
            directory.close();
        }
    }

}

What am I doing wrong? Looking through solrconfig.xml I can't tell which analyzer solr uses by default. That's why I tried both SimpleAnalyzer and StandardAnalyzer.

Suggestions on how to debug this would be greatly appreciated.

Update: Here are the fields in my schema:

<field name="metaDataUrl" type="string" indexed="true" stored="true" required="true"/>
<field name="title" type="text" stored="true" indexed="true"/>
<field name="snippet" type="text" indexed="true" stored="true"/>
<field name="rest" type="string" stored="true" indexed="false" multiValued="true"/>
<field name="date_indexed" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<field name="all" type="text" stored="false" indexed="true" multiValued="true"/>

And, here's the XML for fieldType text from schema.xml:

<!-- A text field that uses WordDelimiterFilter to enable splitting and matching of                                                                                                             
    words on case-change, alpha numeric boundaries, and non-alphanumeric chars,                                                                                                                 
    so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".                                                                                                             
    Synonyms and stopwords are customized by external files, and stemming is enabled.                                                                                                           
    -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- in this example, we will only use synonyms at query time                                                                                                                               
    <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>                                                                                  
    -->
    <!-- Case insensitive stop word removal.                                                                                                                                                    
      add enablePositionIncrements=true in both the index and query                                                                                                                             
      analyzers to leave a 'gap' for more accurate phrase queries.                                                                                                                              
    -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
  </analyzer>
</fieldType>

回答1:

You need to build a custom analyzer using the same tokenizer and filters that were used at indexing time (as defined in the "index" analyzer section of the fieldType XML). Pass that custom analyzer to the query parser, and the search should then work. Also consider whether the SnowballPorterFilter stems "science" to a different term — it may well do so, in which case the literal query term will not match the indexed term.

Refer to http://whiteboardjunkie.wordpress.com/tag/custom-analyzer/ for details on building your custom analyzer. You just need to chain one filter after another in the tokenStream() method.

Also, you can examine the index using luke (http://code.google.com/p/luke/) and see if there are any documents containing "science" in title field at all.



回答2:

A co-worker changed my code slightly to look like the code below. He also suggested I search for word stems. This approach worked and I now get results from searches against that solr built Lucene index. This code still needs work but I'm posting it as a proof-of-concept that I hope will be useful to others.

import java.io.File;

import java.util.List;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Proof-of-concept searcher against a Solr-built Lucene index: parses the
 * query with SimpleAnalyzer, searches the "title" field, and dumps every
 * stored field of each hit.
 *
 * <p>Searching for word stems (as produced by the SnowballPorterFilter at
 * index time) is what makes this approach return results.
 */
public class SimpleSearcher {

    public static void main(String[] args) throws Exception {
        // Index location is hard-coded; the query comes from the command line.
        File indexDir = new File("/path/to/solr/data/index/");
        int hits = 100;

        SimpleSearcher searcher = new SimpleSearcher();
        searcher.searchIndex(indexDir, args[0], hits);
    }

    /**
     * Runs {@code queryStr} against the "title" field and prints all stored
     * fields of up to {@code maxHits} matching documents.
     *
     * @param indexDir directory containing the Lucene index files
     * @param queryStr query string parsed with Lucene query syntax
     * @param maxHits  maximum number of documents to retrieve
     * @throws Exception on I/O or query-parse failure
     */
    private void searchIndex(File indexDir, String queryStr, int maxHits)
            throws Exception {

        Directory directory = FSDirectory.open(indexDir);

        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            // FIX: the index is Lucene 3.6.1, so use LUCENE_36 (the original
            // used LUCENE_35) and the non-deprecated SimpleAnalyzer ctor.
            QueryParser parser = new QueryParser(Version.LUCENE_36,
             "title", new SimpleAnalyzer(Version.LUCENE_36));
            Query query = parser.parse(queryStr);

            TopDocs topDocs = searcher.search(query, maxHits);

            ScoreDoc[] hits = topDocs.scoreDocs;
            for (int i = 0; i < hits.length; i++) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                List<Fieldable> fields = d.getFields();

                System.out.println( (i+1) + ". ==========================================================");
                // Only stored fields have retrievable values to print.
                for ( Fieldable field : fields ) {
                   if (field.isStored()) {
                     System.out.println(" >> " + field.name() + " - " + d.get(field.name()));
                   }
                }
            }

            System.out.println("Found " + hits.length);
        } finally {
            // FIX: the original leaked both handles.
            searcher.close();
            directory.close();
        }
    }
}


标签: solr lucene