1 | package dualist.ja; |
---|
2 | |
---|
3 | import cc.mallet.pipe.Pipe; |
---|
4 | import cc.mallet.extract.StringSpan; |
---|
5 | import cc.mallet.extract.StringTokenization; |
---|
6 | import cc.mallet.types.Instance; |
---|
7 | import cc.mallet.types.TokenSequence; |
---|
8 | |
---|
9 | import org.chasen.mecab.Tagger; |
---|
10 | import org.chasen.mecab.Node; |
---|
11 | |
---|
12 | public class SimpleMecabPipe extends Pipe |
---|
13 | { |
---|
14 | static { |
---|
15 | try { |
---|
16 | System.loadLibrary("mecab-java"); |
---|
17 | } catch (UnsatisfiedLinkError e) { |
---|
18 | System.err.println("ERROR: Failed to load mecab-java native code."); |
---|
19 | System.err.println(e); |
---|
20 | System.exit(1); |
---|
21 | } |
---|
22 | } |
---|
23 | |
---|
24 | public Instance pipe (Instance carrier) |
---|
25 | { |
---|
26 | CharSequence input = (CharSequence) carrier.getData(); |
---|
27 | String string = input.toString(); |
---|
28 | Tagger tagger = new Tagger(); |
---|
29 | Node node = tagger.parseToNode(string); |
---|
30 | int cursor = 0; |
---|
31 | TokenSequence ts = new StringTokenization(input); |
---|
32 | while (node != null) { |
---|
33 | node = node.getNext(); |
---|
34 | if (node == null) break; |
---|
35 | String[] f = node.getFeature().split(","); |
---|
36 | if (f[0].equals("名詞") && |
---|
37 | !f[1].equals("数") && !f[1].equals("サ変接続") && !f[1].equals("接尾") || |
---|
38 | f[0].equals("未知語")) { |
---|
39 | String surface = node.getSurface(); |
---|
40 | cursor = string.indexOf(surface, cursor); |
---|
41 | ts.add (new StringSpan(input, cursor, cursor + surface.length())); |
---|
42 | } |
---|
43 | } |
---|
44 | carrier.setData(ts); |
---|
45 | return carrier; |
---|
46 | } |
---|
47 | } |
---|