update kuromoji.js

This commit is contained in:
mzsm 2014-12-26 06:36:53 +09:00
parent f4007ed324
commit fe562facd3
2 changed files with 144 additions and 56 deletions

View File

@ -1,9 +1,33 @@
/// <reference path="kuromoji.d.ts" />
// From https://github.com/takuyaa/kuromoji.js/blob/master/README.md#usage
// Builds a tokenizer from a dictionary directory, tokenizes a sample sentence and
// assigns every token field to a typed scratch variable so the compiler checks
// each field's declared type.
kuromoji.builder({ dicPath: "/url/to/dictionary/dir/" }).build(function (err, tokenizer) {
    var path = tokenizer.tokenize("すもももももももものうち");
    // Scratch targets: one per primitive type used by the token fields.
    var num_tmp: number;
    var str_tmp: string;
    path.forEach((token)=>{
        console.log(token.word_id);
        console.log(token.surface_form);
        num_tmp = token.word_id;
        str_tmp = token.word_type;
        num_tmp = token.word_position;
        str_tmp = token.surface_form;
        str_tmp = token.pos;
        str_tmp = token.pos_detail_1;
        str_tmp = token.pos_detail_2;
        str_tmp = token.pos_detail_3;
        str_tmp = token.conjugated_type;
        str_tmp = token.conjugated_form;
        str_tmp = token.basic_form;
        str_tmp = token.reading;
        str_tmp = token.pronunciation;
    });
    // Exactly two closers are needed (forEach callback, build callback); a third
    // one left the statement unbalanced.
});
// From https://github.com/takuyaa/kuromoji.js/blob/master/test/resource/minimum-dic/minimum.csv
// Two CSV entries separated by a single newline.
var minimum_dict =
    "すもも,1285,1285,7546,名詞,一般,*,*,*,*,すもも,スモモ,スモモ" +
    '\n' +
    "もも,1285,1285,7219,名詞,一般,*,*,*,*,もも,モモ,モモ";

// Create a builder, then re-assign it from addTokenInfoDictionary so the
// declared return type is checked against DictionaryBuilder.
var builder = kuromoji.dictionaryBuilder();
builder = builder.addTokenInfoDictionary(minimum_dict);

// Build the dictionaries from the text supplied so far.
var dict = builder.build();

170
kuromoji/kuromoji.d.ts vendored
View File

@ -3,45 +3,114 @@
// Definitions by: MIZUSHIMA Junki <https://github.com/mzsm>
// Definitions: https://github.com/borisyankov/DefinitelyTyped
/// <reference path="../doublearray/doublearray.d.ts" />
declare module kuromoji {
interface TokenizerBuilderOption {
dicPath?: string;
// dict/ConnectionCosts.js
// Typing for the connection-cost object of kuromoji.js (dict/ConnectionCosts.js).
interface ConnectionCosts {
    // Int16Array buffer held by the object.
    buffer: Int16Array;

    // Takes a forward id, a backward id and a cost; returns nothing.
    put(forward_id: number, backward_id: number, cost: number): void;

    // Takes a forward id and a backward id; returns a number.
    get(forward_id: number, backward_id: number): number;

    // Takes an Int16Array of connection costs; returns nothing.
    loadConnectionCosts(connection_costs_buffer: Int16Array): void;
}
interface TokenizerBuilder<T> {
build(callback: (err: Error, tokenizer: Tokenizer<T>) => void): void;
// dict/DynamicDictionaries.js
// Typing for the dictionary bundle of kuromoji.js (dict/DynamicDictionaries.js).
interface DynamicDictionaries {
    // Double-array trie.
    trie: doublearray.DoubleArray;
    // Token-info dictionary.
    token_info_dictionary: TokenInfoDictionary;
    // Connection-cost object.
    connection_costs: ConnectionCosts;
    // Unknown dictionary.
    unknown_dictionary: UnknownDictionary;

    // Takes a base buffer and a check buffer (both Int32Array) and returns a
    // DynamicDictionaries.
    loadTrie(base_buffer: Int32Array, check_buffer: Int32Array): DynamicDictionaries;
}
interface Tokenizer<T> {
token_info_dictionary: any;
unknown_dictionary: any;
viterbi_builder: ViterbiBuilder;
viterbi_searcher: ViterbiSearcher;
formatter: T;
tokenize(text: string): T[];
getLattice(text: string): ViterbiLattice;
// dict/TokenInfoDictionary.js
// Typing for the token-info dictionary of kuromoji.js (dict/TokenInfoDictionary.js).
interface TokenInfoDictionary {
    // Takes rows of entries and returns an object mapping numeric word ids to strings.
    buildDictionary(entries: any[][]): {[word_id: number]: string};

    // Takes ids, a word cost, a surface form and a feature string; returns a number.
    put(left_id: number, right_id: number, word_cost: number, surface_form: string, feature: string): number;

    // Takes a source number and a target number; returns nothing.
    addMapping(source: number, target: number): void;

    // Returns a Uint8Array.
    targetMapToBuffer(): Uint8Array;

    // Loaders: each takes a Uint8Array and returns a TokenInfoDictionary.
    loadDictionary(array_buffer: Uint8Array): TokenInfoDictionary;
    loadPosVector(array_buffer: Uint8Array): TokenInfoDictionary;
    loadTargetMap(array_buffer: Uint8Array): TokenInfoDictionary;

    // Takes a token-info id given as a string; returns a string.
    getFeatures(token_info_id_str: string): string;
}
// dict/UnknownDictionary.js
// Declares no members of its own; every member comes from TokenInfoDictionary.
interface UnknownDictionary extends TokenInfoDictionary {}
// util/ByteBuffer.js
// Typing for the byte buffer helper of kuromoji.js (util/ByteBuffer.js).
interface ByteBuffer {
    // Uint8Array held by the object.
    buffer: Uint8Array;
    // Numeric position.
    position: number;

    // Returns a number.
    size(): number;
    // Takes nothing, returns nothing.
    reallocate(): void;
    // Returns a Uint8Array.
    shrink(): Uint8Array;

    // Single value: put takes a number, get takes an index and returns a number.
    put(b: number): void;
    get(index: number): number;

    // Short: same put/get shape as above.
    putShort(num: number): void;
    getShort(index: number): number;

    // Int: same put/get shape, plus an argument-less read returning a number.
    putInt(num: number): void;
    getInt(index: number): number;
    readInt(): number;

    // String: put takes a string, get takes an index and returns a string.
    putString(str: string): void;
    getString(index: number): string;
}
// util/DictionaryBuilder.js
// Typing for the dictionary builder of kuromoji.js (util/DictionaryBuilder.js).
interface DictionaryBuilder {
    // Entry lists, both typed as string arrays.
    tid_entries: string[];
    unk_entries: string[];

    // Text-input methods: each takes a string and returns a DictionaryBuilder.
    addTokenInfoDictionary(text: string): DictionaryBuilder;
    costMatrix(matrix_text: string): DictionaryBuilder;
    charDef(char_text: string): DictionaryBuilder;
    unkDef(text: string): DictionaryBuilder;

    // Returns a DynamicDictionaries.
    build(): DynamicDictionaries;

    // Returns an object carrying a trie and a token-info dictionary.
    buildTokenInfoDictionary(): {
        trie: doublearray.DoubleArray;
        token_info_dictionary: TokenInfoDictionary;
    };
    // Each of the remaining build methods returns the type named in its signature.
    buildUnknownDictionary(): UnknownDictionary;
    buildConnectionCosts(): ConnectionCosts;
    buildDoubleArray(): doublearray.DoubleArray;
}
// util/IpadicFormatter.js
// Generic formatter: both methods take the same four arguments
// (word id, position, type, feature list) and return a T.
interface Formatter<T> {
    formatEntry(
        word_id: number,
        position: number,
        type: string,
        features: string[]
    ): T;

    formatUnknownEntry(
        word_id: number,
        position: number,
        type: string,
        features: string[]
    ): T;
}
// Formatter specialised to IpadicFeatures; declares no members of its own.
interface IpadicFormatter extends Formatter<IpadicFeatures> {}
// Shape of one formatted token. Every field is required except
// `reading` and `pronunciation`, which may be absent.
export interface IpadicFeatures {
    // Numeric fields.
    word_id: number;
    word_position: number;

    // Required string fields.
    word_type: string;
    surface_form: string;
    pos: string;
    pos_detail_1: string;
    pos_detail_2: string;
    pos_detail_3: string;
    conjugated_type: string;
    conjugated_form: string;
    basic_form: string;

    // Optional string fields.
    reading?: string;
    pronunciation?: string;
}
// viterbi/ViterbiBuilder.js
// Typing for the lattice builder of kuromoji.js (viterbi/ViterbiBuilder.js).
// Each dictionary member is declared exactly once, with its precise type:
// declaring the same property twice (once as `any`, once typed) is a
// duplicate-identifier error.
interface ViterbiBuilder {
    // Double-array trie.
    trie: doublearray.DoubleArray;
    // Token-info dictionary.
    token_info_dictionary: TokenInfoDictionary;
    // Unknown dictionary.
    unknown_dictionary: UnknownDictionary;

    // Takes a sentence string and returns a ViterbiLattice.
    build(sentence_str: string): ViterbiLattice;
}
// Typing for the lattice searcher.
interface ViterbiSearcher {
    // NOTE(review): still `any`; a typed connection-cost object is the likely
    // intent — confirm before tightening.
    connection_costs: any;

    // Takes a lattice and returns an array of ViterbiNode.
    search(lattice: ViterbiLattice): ViterbiNode[];

    // Return type is spelled out: a signature without one is implicitly `any`.
    forward(lattice: ViterbiLattice): ViterbiLattice;
}
// viterbi/ViterbiLattice.js
// Typing for the lattice of kuromoji.js (viterbi/ViterbiLattice.js).
interface ViterbiLattice {
    // Takes one ViterbiNode; returns nothing.
    append(node: ViterbiNode): void;

    // Takes no arguments; returns nothing.
    // NOTE(review): presumably appends an end-of-sentence node — confirm upstream.
    appendEos(): void;
}
// viterbi/ViterbiNode.js
interface ViterbiNode {
name: string;
cost: number;
@ -55,42 +124,37 @@ declare module kuromoji {
type: string;
}
interface IpadicFormatter {
formatEntry(word_id: number, position: number, type: string, features: string[]): IpadicFormat;
formatUnknownEntry(word_id: number, position: number, type: string, features: string[]): IpadicFormat;
// viterbi/ViterbiSearcher.js
// Typing for the lattice searcher of kuromoji.js (viterbi/ViterbiSearcher.js).
interface ViterbiSearcher {
    // Connection-cost object.
    connection_costs: ConnectionCosts;

    // Takes a lattice and returns an array of ViterbiNode.
    search(lattice: ViterbiLattice): ViterbiNode[];

    // Takes a lattice and returns a lattice.
    forward(lattice: ViterbiLattice): ViterbiLattice;

    // Takes a lattice and returns an array of ViterbiNode.
    backward(lattice: ViterbiLattice): ViterbiNode[];
}
interface IpadicFormat {
word_id: number;
word_type: string;
word_position: number;
surface_form: number;
pos: string;
pos_detail_1: string;
pos_detail_2: string;
pos_detail_3: string;
conjugated_type: string;
conjugated_form: string;
basic_form: string;
reading?: string;
pronunciation?: string;
// Tokenizer.js
// Typing for the static side of the tokenizer (Tokenizer.js).
interface TokenizerStatic {
    // Takes an input string and returns an array of strings.
    splitByPunctuation(input: string): string[];
}
// Typing for a tokenizer whose tokens are of type T.
interface Tokenizer<T> {
    // Dictionaries held by the tokenizer.
    token_info_dictionary: TokenInfoDictionary;
    unknown_dictionary: UnknownDictionary;

    // Viterbi builder and searcher held by the tokenizer.
    viterbi_builder: ViterbiBuilder;
    viterbi_searcher: ViterbiSearcher;

    // Formatter whose methods return T.
    formatter: Formatter<T>;

    // Takes text and returns an array of T.
    tokenize(text: string): T[];

    // Takes text and returns a ViterbiLattice.
    getLattice(text: string): ViterbiLattice;
}
interface DictionaryBuilder {
tid_entries: number[];
unk_entries: number[];
matrix_text: string;
char_text: string;
addTokenInfoDictionary(text: string): DictionaryBuilder;
costMatrix(matrix_text: string): DictionaryBuilder;
charDef(char_text: string): DictionaryBuilder;
unkDef(text: string): DictionaryBuilder;
build(): DynamicDictionaries;
// TokenizerBuilder.js
// Typing for the tokenizer builder (TokenizerBuilder.js).
interface TokenizerBuilder<T> {
    // Callback-style build: `callback` receives an Error and a Tokenizer<T>;
    // the method itself returns nothing.
    build(
        callback: (err: Error, tokenizer: Tokenizer<T>) => void
    ): void;
}
// Options accepted by `builder`.
interface TokenizerBuilderOption {
    // Optional dictionary path, given as a string.
    dicPath?: string;
}
// Empty declaration: it contributes no members. Interfaces sharing a name within one
// module are merged, so this adds nothing to any other `DynamicDictionaries` declaration.
interface DynamicDictionaries {
}
// NOTE(review): `builder` is declared twice in this span — first with an optional
// `option` returning TokenizerBuilder<IpadicFormat>, then with a required `option`
// returning TokenizerBuilder<IpadicFeatures>. Overloads are tried in declaration
// order, so the first signature wins for every call that matches both — confirm
// which one is intended.
export function builder(option?: TokenizerBuilderOption): TokenizerBuilder<IpadicFormat>;
// kuromoji.js
export function builder(option: TokenizerBuilderOption): TokenizerBuilder<IpadicFeatures>;
// Takes no arguments and returns a DictionaryBuilder.
export function dictionaryBuilder(): DictionaryBuilder;
}