为什么使用非拉丁文本的 Lucene 模糊搜索更容易导致 TooComplexToDeterminizeException？

Cru*_*ool 7 java lucene internationalization

我有以下测试用例，我在其中执行模糊查询。两种情况都插入了一些“Lorem Ipsum”文本，但一种使用拉丁字母，而另一种使用西里尔字母。

但是，西里尔文测试用例抛出异常

org.apache.lucene.util.automaton.TooComplexToDeterminizeException: Determinizing automaton with 34479 states and 58454 transitions would result in more than 10000 states.

而拉丁文没有，即使它包含更多的字符和单词。

如果我将西里尔文用例的最大编辑距离减小到 1，它就可以正常工作。但我想知道，为什么这个问题会出现在一种字母表上，而不出现在另一种字母表上。我还注意到其他文字（例如梵文）也有类似的行为。

我该怎么做，才能无论输入语言如何都避免这些问题，同时仍将最大编辑距离保持为 2？

该测试使用 JUnit 4.12 和 Lucene Core + Common Analyzers 8.3.0 编写。

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;

/**
 * Reproduction case: index one document whose NAME field holds a long text, then
 * look it up again with a {@link FuzzyQuery} built from that same text.
 *
 * <p>One test uses Latin text, the other is meant to use Cyrillic text. Both run the
 * same index-and-search round trip in {@link #runLuceneTest}.
 */
public class LuceneTest
{
  private static final String ID_FIELD = "ID";
  private static final String NAME_FIELD = "NAME";

  /** Id stored in the single indexed document and expected back from the search. */
  private static final int DOC_ID = 42;

  /** Upper bound on the number of hits fetched per search. */
  private static final int MAX_HITS = 50;

  /**
   * Round trip with Latin "Lorem ipsum" text and a maximum edit distance of 2.
   *
   * @throws IOException if indexing or searching fails
   */
  @Test
  public void testLatin() throws IOException
  {
    String paragraph = "Lorem ipsum dolor sit amet, has ex bonorum scripserit,"
        + " graeco volutpat aliquando ut eum, nec partem evertitur maiestatis ea."
        + " Ad vim aliquam dignissim. Eu audire delectus eum, vel eu viderer democritum voluptatum."
        + " Eu ludus utinam fabulas vim, mei te accumsan conceptam, at sea possit aperiam tacimates."
        + " Sed periculis repudiare ut.";

    // The text is the same paragraph twice, separated by a single space.
    String latin = paragraph + " " + paragraph;

    // Analyzer is Closeable; release it once the test is done.
    try ( Analyzer analyzer = new StandardAnalyzer() )
    {
      // This completes successfully.
      runLuceneTest( latin, latin, analyzer, 2 );
    }
  }

  /**
   * Round trip with Cyrillic text and a maximum edit distance of 2.
   *
   * @throws IOException if indexing or searching fails
   */
  @Test
  public void testCyrillic() throws IOException
  {
    // NOTE(review): this literal contains only '?' characters; the Cyrillic letters
    // appear to have been lost to an encoding problem in this copy of the file.
    // Restore the original text before relying on this test as a reproduction.
    String cyrillic = "????? ????? ????? ??? ????, ????? ?????? ??????? ?? ???, ?? ??? ?????? ??????? ???????."
        + " ????? ????????????? ??? ??, ?? ????? ?????? ?????????? ???, ????? ?????? ??? ??."
        + " ?? ??? ????? ?????? ?????????.";

    // Analyzer is Closeable; release it once the test is done.
    try ( Analyzer analyzer = new RussianAnalyzer() )
    {
      // This throws an exception with edit distance 2.
      runLuceneTest( cyrillic, cyrillic, analyzer, 2 );
    }
  }

  /**
   * Indexes a single document into a fresh in-memory directory, runs a fuzzy search
   * and asserts that exactly that document is returned.
   *
   * @param insertText      text stored in the NAME field of the indexed document
   * @param searchText      text used as the fuzzy query term
   * @param analyzer        analyzer handed to the {@link IndexWriterConfig}; the caller keeps ownership
   * @param maxEditDistance maximum edit distance passed to the {@link FuzzyQuery}
   * @throws IOException if indexing or searching fails
   */
  private void runLuceneTest( String insertText, String searchText, Analyzer analyzer, int maxEditDistance )
      throws IOException
  {
    // The directory is a resource too: close it even when an assertion or the search fails.
    try ( Directory index = new RAMDirectory() )
    {
      try ( IndexWriter writer = new IndexWriter( index, new IndexWriterConfig( analyzer ) ) )
      {
        writer.addDocument( createDoc( DOC_ID, insertText ) );
        writer.commit();
      }

      List<Document> result = search( index, searchText, maxEditDistance );
      assertEquals( "Number of results", 1, result.size() );
      assertEquals( "Document ID", DOC_ID, Integer.parseInt( result.get( 0 ).get( ID_FIELD ) ) );
    }
  }

  /**
   * Builds the document to index: a stored ID field and an unstored NAME field.
   *
   * @param id   value written to the ID field
   * @param name value written to the NAME field
   * @return the new document
   */
  private Document createDoc( long id, String name )
  {
    Document doc = new Document();
    doc.add( new TextField( ID_FIELD, Long.toString( id ), Store.YES ) );
    // NOTE(review): StringField indexes the whole value as one untokenized term, so the
    // analyzer is not applied to NAME and the fuzzy query below is matched against the
    // entire text as a single, very long term. Confirm this is intended.
    doc.add( new StringField( NAME_FIELD, name, Store.NO ) );
    return doc;
  }

  /**
   * Runs a fuzzy query for {@code name} against the NAME field and loads the matching documents.
   *
   * @param index           directory holding the index to search
   * @param name            term text for the fuzzy query
   * @param maxEditDistance maximum edit distance passed to the {@link FuzzyQuery}
   * @return the stored documents of the top hits, at most {@link #MAX_HITS}
   * @throws IOException if the index cannot be opened or read
   */
  private List<Document> search( Directory index, String name, int maxEditDistance ) throws IOException
  {
    FuzzyQuery fuzzy = new FuzzyQuery( new Term( NAME_FIELD, name ), maxEditDistance );
    BooleanQuery query = new BooleanQuery.Builder()
        .add( fuzzy, Occur.SHOULD )
        .build();

    try ( IndexReader reader = DirectoryReader.open( index ) )
    {
      IndexSearcher searcher = new IndexSearcher( reader );
      TopDocs docs = searcher.search( query, MAX_HITS );

      List<Document> result = new ArrayList<>( docs.scoreDocs.length );
      for ( ScoreDoc hit : docs.scoreDocs )
      {
        result.add( searcher.doc( hit.doc ) );
      }
      return result;
    }
  }

}

Run Code Online (Sandbox Code Playgroud)

归档时间：	6 年，2 月前
查看次数：	271 次
最近记录：	6 年，2 月前