kuromojiとマルコフ連鎖で格言を生成する - H2Databaseを追っかけていたりしたブログ

マルコフ連鎖と、格言や本日の占いのたぐいは相性がいいのではと思って、kuromojiを使って、ちょっと書いてみた。

senを使ってベイジアンフィルタを書いてみたときも、こんなに簡単に使えるのかと感心したのだけれど、kuromojiだと辞書も同梱なので、より一層簡単に使える。こういうものがオープンソースで使う事が出来るのだからすばらしいなあと思う。

400弱の格言を集めてきて、元ネタとして使用してみたら、下記のような格言が生成された。わかるようなわからないような、というのが格言っぽいと思うけどどうだろう。

人と一緒になっても、百一人でも多く会うほうが楽だが、それが迷いであってはいけない。

ロジックとしてはこった事は特にしていない。名詞で始める事と、句点が来た場合は三分の一の確率でそこで終了する、くらい。

Key.java

package com.karatebancho.aphorism;

import org.atilika.kuromoji.Token;

/**
 * State of the Markov chain: an ordered pair of consecutive tokens.
 *
 * <p>Equality and hashing are based only on the surface forms of the two tokens,
 * so pairs built from different {@link Token} instances with the same text are
 * treated as the same key. The tokens themselves are kept so that a generated
 * sentence can be started from them.
 *
 * <p>Instances are immutable, which makes them safe to use as hash keys.
 */
class Key {
    /** Part-of-speech label that kuromoji reports as the first feature of a noun. */
    private static final String NOUN = "名詞";

    private final Token token1;
    private final Token token2;
    // Surface forms are captured once at construction; they alone define identity.
    private final String key1;
    private final String key2;

    /**
     * Creates a key for the pair {@code (token1, token2)}.
     *
     * @param token1 first token of the pair, must not be null
     * @param token2 second token of the pair, must not be null
     * @throws NullPointerException if either token is null
     */
    public Key(Token token1, Token token2) {
        if (token1 == null) {
            throw new NullPointerException("token1");
        }
        if (token2 == null) {
            throw new NullPointerException("token2");
        }
        this.token1 = token1;
        this.token2 = token2;
        this.key1 = token1.getSurfaceForm();
        this.key2 = token2.getSurfaceForm();
    }

    /** @return the first token of the pair */
    public Token getToken1() {
        return token1;
    }

    /** @return the second token of the pair */
    public Token getToken2() {
        return token2;
    }

    /**
     * Tells whether the first token's leading feature is the noun label.
     *
     * @return {@code true} if the first feature of the first token equals "名詞";
     *         {@code false} otherwise, including when the token reports no features
     */
    public boolean isStartWithNoun() {
        String[] features = token1.getAllFeaturesArray();
        return features != null && features.length > 0 && NOUN.equals(features[0]);
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((key1 == null) ? 0 : key1.hashCode());
        result = prime * result + ((key2 == null) ? 0 : key2.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Key)) {
            // Also covers obj == null.
            return false;
        }
        Key other = (Key) obj;
        return sameText(key1, other.key1) && sameText(key2, other.key2);
    }

    /** Null-safe string equality. */
    private static boolean sameText(String a, String b) {
        return (a == null) ? (b == null) : a.equals(b);
    }
}

App.java

package com.karatebancho.aphorism;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import org.atilika.kuromoji.Token;
import org.atilika.kuromoji.Tokenizer;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;

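/**
 * Generates a random aphorism with a Markov chain built from kuromoji tokens.
 * <p>
 * A dictionary mapping each pair of consecutive tokens to the tokens that follow it
 * is built from the classpath resource {@code /aphorism.txt}; the chain is then walked
 * at random and the resulting text is printed to standard output.
 */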
public class App {
    /**
     * Normalises a line of input so callers never have to deal with {@code null}.
     *
     * @param str the text to normalise, may be {@code null}
     * @return {@code str} itself when it contains at least one character,
     *         otherwise the empty string
     */
    public static String extract(String str) {
        boolean hasText = str != null && str.length() > 0;
        return hasText ? str : "";
    }

    /**
     * Entry point: builds the transition dictionary and prints one generated aphorism.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if building the dictionary fails
     */
    public static void main(String[] args) throws Exception {
        createAphorism(createDic());
    }

    /**
     * Generates one aphorism by walking the chain and prints it to standard output.
     *
     * <p>The walk starts from a randomly chosen key whose first token is a noun, then
     * repeatedly appends a random follower of the current key. It stops when the
     * current key has no followers, or when a "。" token was appended and
     * {@link #finish()} returns {@code true}.
     *
     * @param dic transition dictionary mapping a token pair to the tokens that follow it
     * @throws IllegalArgumentException if {@code dic} contains no key starting with a noun
     *         (which includes an empty dictionary)
     */
    public static void createAphorism(Multimap<Key, Token> dic) {
        // Collect the eligible start states first. Drawing random keys until one
        // happens to start with a noun would never terminate if none exists.
        List<Key> starts = new ArrayList<Key>();
        for (Key candidate : dic.keySet()) {
            if (candidate.isStartWithNoun()) {
                starts.add(candidate);
            }
        }
        if (starts.isEmpty()) {
            throw new IllegalArgumentException("dictionary contains no key starting with a noun");
        }

        Random rand = new Random();
        Key key = starts.get(rand.nextInt(starts.size()));

        List<Token> aphorism = new ArrayList<Token>();
        aphorism.add(key.getToken1());
        aphorism.add(key.getToken2());
        while (dic.containsKey(key)) {
            Collection<Token> followers = dic.get(key);
            Token next = get(followers, rand.nextInt(followers.size()));
            aphorism.add(next);
            // Slide the two-token window forward by one token.
            key = new Key(key.getToken2(), next);
            if (next.getSurfaceForm().equals("。") && finish()) {
                break;
            }
        }
        System.out.println(toString(aphorism));
    }

    /**
     * Joins the surface forms of the given tokens, in order, with no separator.
     *
     * @param aphorism tokens making up the generated sentence
     * @return the concatenated text
     */
    public static String toString(List<Token> aphorism) {
        StringBuilder text = new StringBuilder();
        Iterator<Token> tokens = aphorism.iterator();
        while (tokens.hasNext()) {
            text.append(tokens.next().getSurfaceForm());
        }
        return text.toString();
    }

    /**
     * Randomly decides whether generation should stop at a sentence end.
     *
     * @return {@code true} with probability one third
     */
    public static boolean finish() {
        // nextInt(3) is uniform over {0, 1, 2}; accepting a single value gives 1/3.
        // The previous "<= 1" test accepted two of the three values, i.e. 2/3.
        return new Random().nextInt(3) == 0;
    }

    /**
     * Returns the element at iteration position {@code pos} of a collection.
     *
     * @param col the collection to read from, may be {@code null}
     * @param pos zero-based position in iteration order
     * @param <T> element type
     * @return the element at {@code pos}, or {@code null} (after printing a diagnostic
     *         line to standard output) when {@code col} is {@code null} or {@code pos}
     *         is outside {@code 0 .. size-1}
     */
    public static <T> T get(Collection<T> col, int pos) {
        if (col == null || pos < 0 || pos >= col.size()) {
            // Report size 0 for a null collection instead of dereferencing it.
            int size = (col == null) ? 0 : col.size();
            System.out.println(String.format("size : %d pos : %d", size, pos));
            return null;
        }
        if (col instanceof List) {
            // Lists support positional access directly.
            return ((List<T>) col).get(pos);
        }
        Iterator<T> itr = col.iterator();
        for (int i = 0; i < pos; i++) {
            itr.next();
        }
        return itr.next();
    }

    /**
     * Builds the transition dictionary from the classpath resource {@code /aphorism.txt}.
     *
     * <p>Every line is tokenized, and for each run of three consecutive tokens
     * {@code (a, b, c)} the entry {@code (a, b) -> c} is recorded. The sliding window is
     * not reset between lines, so the last tokens of one line lead into the first
     * tokens of the next.
     *
     * @return multimap from a token pair to every token observed right after it
     * @throws Exception if the resource is missing or cannot be read
     */
    public static Multimap<Key, Token> createDic() throws Exception {
        // Open the resource as a stream: converting its URL to a File breaks when the
        // path contains URL-encoded characters or the resource lives inside a jar.
        java.io.InputStream in = App.class.getResourceAsStream("/aphorism.txt");
        if (in == null) {
            throw new java.io.FileNotFoundException("classpath resource /aphorism.txt not found");
        }
        Multimap<Key, Token> dic = ArrayListMultimap.create();
        // Decode explicitly as UTF-8 (the encoding the pom declares for the project)
        // rather than relying on the platform default charset.
        BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        try {
            // One tokenizer is enough; there is no need to rebuild it for every line.
            Tokenizer tokenizer = Tokenizer.builder().build();
            Token key1 = null;
            Token key2 = null;
            Token val = null;
            String str;
            while ((str = br.readLine()) != null) {
                for (Token token : tokenizer.tokenize(extract(str))) {
                    // Shift the three-token window: (key1, key2) -> val.
                    key1 = key2;
                    key2 = val;
                    val = token;
                    if (key1 != null && key2 != null) {
                        dic.put(new Key(key1, key2), val);
                    }
                }
            }
        } finally {
            // Release the reader even when reading or tokenizing fails.
            br.close();
        }
        return dic;
    }
}

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<!-- Maven build descriptor for the aphorism generator (Key.java / App.java). -->
	<modelVersion>4.0.0</modelVersion>

	<groupId>com.karatebancho</groupId>
	<artifactId>aphorism</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>
	<name>aphorism</name>
	<url>http://maven.apache.org</url>
	<properties>
		<!-- Source files are compiled as UTF-8; they contain Japanese string literals. -->
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	</properties>
	<repositories>
		<!-- NOTE(review): presumably the repository the kuromoji artifact below is resolved from; confirm. -->
		<repository>
			<id>Atilika Open Source repository</id>
			<url>http://www.atilika.org/nexus/content/repositories/atilika</url>
		</repository>
	</repositories>
	<dependencies>
		<!-- Presumably supplies the com.google.common.collect Multimap / ArrayListMultimap classes imported by App.java. -->
		<dependency>
			<groupId>com.google.collections</groupId>
			<artifactId>google-collections</artifactId>
			<version>1.0</version>
		</dependency>
		<!-- Presumably supplies the org.atilika.kuromoji Token / Tokenizer classes used for morphological analysis. -->
		<dependency>
			<groupId>org.atilika.kuromoji</groupId>
			<artifactId>kuromoji</artifactId>
			<version>0.7.7</version>
			<type>jar</type>
			<scope>compile</scope>
		</dependency>
	</dependencies>
</project>