Context Navigation

Back to Ticket #31155

Ticket #31155: patch-solr-ja.2.diff

File patch-solr-ja.2.diff, 8.0 KB (added by humem (humem), 13 years ago)

example/solr-ja/conf/schema.xml

-                      old
+                      new
     See http://wiki.apache.org/solr/SpatialSearch
    -->
     <fieldtype name="geohash" class="solr.GeoHashField"/>
+    <!-- configuration for japanese text, using a morphological analyzer
+      Most possibilities for customization are specified here in the schema.
+      Note: you can set the default query operator to be OR, AND, or PHRASE:
+       OR: Use these defaults (autoGeneratePhraseQueries="false", <solrQueryParser defaultOperator="OR"/>).
+           In this case Solr works like it does with the English language. The default query is OR,
+           but documents that contain more of the query terms get a special boost. You can probably
+           use a less aggressive stopwords/stoptags in this case, and it's probably a good idea to use
+           enablePositionIncrements=true, so that if a user puts a query in quotes, they get a much more
+           exact phrase query.
+       AND: Set autoGeneratePhraseQueries=false, but set <solrQueryParser defaultOperator="AND"/> in
+           your schema.xml. Note if you do this, you should use a more aggressive stopwords/stoptags
+           list (at least at query-time), otherwise a document might not match simply because it does
+           not contain a prefix or particle. As in the above case, it's probably a good idea to use
+           enablePositionIncrements=true for explicit phrase queries from the user.
+       PHRASE: Set autoGeneratePhraseQueries=true. If you do this, you should probably use both a very
+           aggressive stopwords list, and you should probably also set enablePositionIncrements=false
+           everywhere.  Otherwise, even documents that contain the query's phrase in exact order will
+           not match because of slightly different grammatical structure.
+    -->
+    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+      <analyzer>
+        <!-- map characters before the tokenizer:
+             Optionally, instead of the JapaneseWidthFilterFactory, you can choose to do the width
+             mappings before the text is sent to the tokenizer.
+        <charFilter class="solr.MappingCharFilterFactory" mapping="@gosen_path@/conf/mapping-japanese.txt"/>
+        -->
+        <!-- morphological tokenizer: sets the SURFACE form as the token, but also sets these attributes:
+             BasicFormAttribute, ConjugationAttribute, PartOfSpeechAttribute, PronunciationsAttribute,
+             ReadingsAttribute, and SentenceStartAttribute.
+        -->
+        <tokenizer class="solr.JapaneseTokenizerFactory"/>
+        <!-- normalizes CJK width differences:
+             - Folds fullwidth ASCII variants into the equivalent basic latin
+             - Folds halfwidth Katakana variants into the equivalent kana
+             Note: alternatively you can use a MappingCharFilter before the tokenizer for this, but please note
+             that mapping characters can change how Sen tokenizes text.
+        -->
+        <filter class="solr.JapaneseWidthFilterFactory"/>
+        <!-- the punctuation filter removes all-punctuation tokens based on Unicode properties.
+             punctuation tokens are tagged as "unknown", and it's better to do this than to remove
+             tokens with an unknown pos (as they might be valuable!). Because this punctuation
+             usually signifies a phrase or sentence boundary, enablePositionIncrements can be
+             used to prevent phrase queries from matching across natural phrase/sentence boundaries -->
+        <filter class="solr.JapanesePunctuationFilterFactory" enablePositionIncrements="true"/>
+        <!-- this is a part-of-speech based stopfilter: it removes any tokens that have a certain
+             part of speech. you can set enablePositionIncrements for tighter phrase queries -->
+        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="@gosen_path@/conf/stoptags_ja.txt" enablePositionIncrements="true"/>
+        <!-- a standard stopfilter, to specify specific stopwords. -->
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="@gosen_path@/conf/stopwords_ja.txt" enablePositionIncrements="true"/>
+        <!-- alternatively, instead of using a part-of-speech based stopfilter, you can use a
+             part-of-speech based keepfilter: specifying only the parts of speech you wish to index.
+             anything else will be removed. HOWEVER: this could be a little dangerous, because if
+             we upgrade ipadic they might add some new tags (the tags are fairly specific), and suddenly
+             things that you were indexing before are no longer being indexed. It's recommended to
+             use the part-of-speech based stopfilter above if at all possible, for safety.
+        <filter class="solr.JapanesePartOfSpeechKeepFilterFactory" tags="@gosen_path@/conf/keeptags_ja.txt" enablePositionIncrements="true"/>
+        -->
+        <!-- before any stemming/lemmatization, you can protect words from being modified by specifying
+             a protwords.txt.
+        <filter class="solr.KeywordMarkerFilterFactory" protected="@gosen_path@/conf/protwords_ja.txt" ignoreCase="false"/>
+             or you can also supply a custom stem dictionary for inflected forms (tab separated). No
+             further stemming/lemmatization will modify this.
+        <filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/>
+        -->
+        <!-- the basic form filter converts inflected verbs and adjectives to their dictionary citation form. -->
+        <filter class="solr.JapaneseBasicFormFilterFactory"/>
+        <!-- this filter heuristically normalizes katakana forms with a final prolonged sound mark -->
+        <filter class="solr.JapaneseKatakanaStemFilterFactory"/>
+        <!-- you might want to lowercase for any english text content you have -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
  </types>
 …
    <!-- catchall field, containing all other searchable text fields (implemented
         via copyField further on in this schema) -->
    <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
+   <field name="text" type="text_ja" indexed="true" stored="false" multiValued="true"/>
    <!-- catchall text field that indexes tokens both normally and in reverse for efficient
         leading wildcard queries. -->

example/solr-ja/conf/solrconfig.xml

-                      old
+                      new
        is found that matches, it will be ignored
     -->
   <lib dir="../../contrib/clustering/lib/" />
+  <lib dir="@gosen_path@/lib/" />
   <lib dir="/total/crap/dir/ignored" />
   <!-- an exact path can be used to specify a specific file.  This
        will cause a serious error to be logged if it can't be loaded.
 …
        <str name="wt">velocity</str>
        <str name="v.template">browse</str>
+       <str name="v.properties">velocity.properties</str>
        <str name="v.layout">layout</str>
        <str name="title">Solritas</str>

example/solr-ja/conf/velocity/head.vm

-                      old
+                      new
            extraParams:{
              'terms.prefix': function() { return $("\#q").val();},
              'terms.sort': 'count',
              'terms.fl': 'name',
+             'terms.fl': 'text',
              'wt': 'velocity',
              'v.template': 'suggest'
+           }

example/solr-ja/conf/velocity/suggest.vm

old	new
1		#foreach($t in $response.response.terms.~~name~~)
	1	#foreach($t in $response.response.terms.text)
2	2	$t.key
3	3	#end
	4	No newline at end of file

Download in other formats:

Original Format