| 470 | |
| 471 | <!-- configuration for Japanese text, using a morphological analyzer. |
| 472 | Most possibilities for customization are specified here in the schema. |
| 473 | |
| 474 | Note: you can set the default query operator to be OR, AND, or PHRASE: |
| 475 | OR: Use these defaults (autoGeneratePhraseQueries="false", <solrQueryParser defaultOperator="OR"/>). |
| 476 | In this case Solr works like it does with the English language. The default query is OR, |
| 477 | but documents that contain more of the query terms get a special boost. You can probably |
| 478 | use a less aggressive stopwords/stoptags list in this case, and it's probably a good idea to use |
| 479 | enablePositionIncrements=true, so that if a user puts a query in quotes, they get a much more |
| 480 | exact phrase query. |
| 481 | AND: Set autoGeneratePhraseQueries=false, but set <solrQueryParser defaultOperator="AND"/> in |
| 482 | your schema.xml. Note if you do this, you should use a more aggressive stopwords/stoptags |
| 483 | list (at least at query-time), otherwise a document might not match simply because it does |
| 484 | not contain a prefix or particle. As in the above case, it's probably a good idea to use |
| 485 | enablePositionIncrements=true for explicit phrase queries from the user. |
| 486 | PHRASE: Set autoGeneratePhraseQueries=true. If you do this, you should probably use a very |
| 487 | aggressive stopwords list and also set enablePositionIncrements=false |
| 488 | everywhere. Otherwise, even documents that contain the query's phrase in exact order will |
| 489 | not match because of slightly different grammatical structure. |
| 490 | --> |
| 491 | <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false"> |
| 492 | <analyzer> |
| 493 | <!-- map characters before the tokenizer: |
| 494 | Optionally, instead of the JapaneseWidthFilterFactory, you can choose to do the width |
| 495 | mappings before the text is sent to the tokenizer. |
| 496 | <charFilter class="solr.MappingCharFilterFactory" mapping="@gosen_path@/conf/mapping-japanese.txt"/> |
| 497 | --> |
| 498 | |
| 499 | <!-- morphological tokenizer: sets the SURFACE form as the token, but also sets these attributes: |
| 500 | BasicFormAttribute, ConjugationAttribute, PartOfSpeechAttribute, PronunciationsAttribute, |
| 501 | ReadingsAttribute, and SentenceStartAttribute. |
| 502 | --> |
| 503 | <tokenizer class="solr.JapaneseTokenizerFactory"/> |
| 504 | |
| 505 | <!-- normalizes CJK width differences: |
| 506 | 1. Folds fullwidth ASCII variants into the equivalent Basic Latin |
| 507 | 2. Folds halfwidth Katakana variants into the equivalent kana |
| 508 | |
| 509 | Note: alternatively you can use a MappingCharFilter before the tokenizer for this, but please note |
| 510 | that mapping characters can change how Sen tokenizes text. |
| 511 | --> |
| 512 | <filter class="solr.JapaneseWidthFilterFactory"/> |
| 513 | |
| 514 | <!-- the punctuation filter removes all-punctuation tokens based on Unicode properties. |
| 515 | Punctuation tokens are tagged as "unknown", and it's better to do this than to remove |
| 516 | tokens with an unknown part of speech (as they might be valuable!). Because this punctuation |
| 517 | usually signifies a phrase or sentence boundary, enablePositionIncrements can be |
| 518 | used to prevent phrase queries from matching across natural phrase/sentence boundaries --> |
| 519 | <filter class="solr.JapanesePunctuationFilterFactory" enablePositionIncrements="true"/> |
| 520 | |
| 521 | <!-- this is a part-of-speech based stop filter: it removes any tokens that have a certain |
| 522 | part of speech. You can set enablePositionIncrements for tighter phrase queries --> |
| 523 | <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="@gosen_path@/conf/stoptags_ja.txt" enablePositionIncrements="true"/> |
| 524 | |
| 525 | <!-- a standard stop filter, used to remove specific stopwords. --> |
| 526 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="@gosen_path@/conf/stopwords_ja.txt" enablePositionIncrements="true"/> |
| 527 | |
| 528 | <!-- alternatively, instead of using a part-of-speech based stop filter, you can use a |
| 529 | part-of-speech based keep filter: specifying only the parts of speech you wish to index. |
| 530 | Anything else will be removed. HOWEVER: this could be a little dangerous, because if |
| 531 | we upgrade ipadic they might add some new tags (the tags are fairly specific), and suddenly |
| 532 | things that you were indexing before are no longer being indexed. It's recommended to |
| 533 | use the part-of-speech based stop filter above if at all possible, for safety. |
| 534 | <filter class="solr.JapanesePartOfSpeechKeepFilterFactory" tags="@gosen_path@/conf/keeptags_ja.txt" enablePositionIncrements="true"/> |
| 535 | --> |
| 536 | |
| 537 | <!-- before any stemming/lemmatization, you can protect words from being modified by specifying |
| 538 | a protected-words file (e.g. protwords_ja.txt). |
| 539 | <filter class="solr.KeywordMarkerFilterFactory" protected="@gosen_path@/conf/protwords_ja.txt" ignoreCase="false"/> |
| 540 | |
| 541 | Or you can also supply a custom stem dictionary for inflected forms (tab separated). No |
| 542 | further stemming/lemmatization will modify this. |
| 543 | <filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/> |
| 544 | --> |
| 545 | |
| 546 | <!-- the basic form filter converts inflected verbs and adjectives to their dictionary citation form. --> |
| 547 | <filter class="solr.JapaneseBasicFormFilterFactory"/> |
| 548 | |
| 549 | <!-- this filter heuristically normalizes katakana forms with a final prolonged sound mark --> |
| 550 | <filter class="solr.JapaneseKatakanaStemFilterFactory"/> |
| 551 | |
| 552 | <!-- you might want to lowercase any English text content you have --> |
| 553 | <filter class="solr.LowerCaseFilterFactory"/> |
| 554 | </analyzer> |
| 555 | </fieldType> |