diff --git a/kuromoji/kuromoji-tests.ts b/kuromoji/kuromoji-tests.ts index d19e7f140a..0355a6501e 100644 --- a/kuromoji/kuromoji-tests.ts +++ b/kuromoji/kuromoji-tests.ts @@ -1,9 +1,33 @@ /// +// From https://github.com/takuyaa/kuromoji.js/blob/master/README.md#usage kuromoji.builder({ dicPath: "/url/to/dictionary/dir/" }).build(function (err, tokenizer) { var path = tokenizer.tokenize("すもももももももものうち"); + var num_tmp: number; + var str_tmp: string; path.forEach((token)=>{ - console.log(token.word_id); - console.log(token.surface_form); + num_tmp = token.word_id; + str_tmp = token.word_type; + num_tmp = token.word_position; + str_tmp = token.surface_form; + str_tmp = token.pos; + str_tmp = token.pos_detail_1; + str_tmp = token.pos_detail_2; + str_tmp = token.pos_detail_3; + str_tmp = token.conjugated_type; + str_tmp = token.conjugated_form; + str_tmp = token.basic_form; + str_tmp = token.reading; + str_tmp = token.pronunciation; }); -}); \ No newline at end of file +}); + +// From https://github.com/takuyaa/kuromoji.js/blob/master/test/resource/minimum-dic/minimum.csv +var minimum_dict = [ + "すもも,1285,1285,7546,名詞,一般,*,*,*,*,すもも,スモモ,スモモ", + "もも,1285,1285,7219,名詞,一般,*,*,*,*,もも,モモ,モモ" +].join('\n'); + +var builder = kuromoji.dictionaryBuilder(); +builder = builder.addTokenInfoDictionary(minimum_dict); +var dict = builder.build(); \ No newline at end of file diff --git a/kuromoji/kuromoji.d.ts b/kuromoji/kuromoji.d.ts index 7ee5ca92af..ed1cd9b80f 100644 --- a/kuromoji/kuromoji.d.ts +++ b/kuromoji/kuromoji.d.ts @@ -3,45 +3,114 @@ // Definitions by: MIZUSHIMA Junki // Definitions: https://github.com/borisyankov/DefinitelyTyped +/// + declare module kuromoji { - interface TokenizerBuilderOption { - dicPath?: string; + // dict/ConnectionCosts.js + interface ConnectionCosts { + buffer: Int16Array; + put(forward_id: number, backward_id: number, cost: number): void; + get(forward_id: number, backward_id: number): number; + loadConnectionCosts(connection_costs_buffer: Int16Array): void; } - interface TokenizerBuilder { - build(callback: (err: Error, tokenizer: Tokenizer) => void): void; + // dict/DynamicDictionaries.js + interface DynamicDictionaries { + trie: doublearray.DoubleArray; + token_info_dictionary: TokenInfoDictionary; + connection_costs: ConnectionCosts; + unknown_dictionary: UnknownDictionary; + loadTrie(base_buffer: Int32Array, check_buffer: Int32Array): DynamicDictionaries; } - interface Tokenizer { - token_info_dictionary: any; - unknown_dictionary: any; - viterbi_builder: ViterbiBuilder; - viterbi_searcher: ViterbiSearcher; - formatter: T; - tokenize(text: string): T[]; - getLattice(text: string): ViterbiLattice; + // dict/TokenInfoDictionary.js + interface TokenInfoDictionary { + buildDictionary(entries: any[][]): {[word_id: number]: string}; + put(left_id: number, right_id: number, word_cost: number, surface_form: string, feature: string): number; + addMapping(source: number, target: number): void; + targetMapToBuffer(): Uint8Array; + loadDictionary(array_buffer: Uint8Array): TokenInfoDictionary; + loadPosVector(array_buffer: Uint8Array): TokenInfoDictionary; + loadTargetMap(array_buffer: Uint8Array): TokenInfoDictionary; + getFeatures(token_info_id_str: string): string; } + // dict/UnknownDictionary.js + interface UnknownDictionary extends TokenInfoDictionary { + } + + // util/ByteBuffer.js + interface ByteBuffer { + buffer: Uint8Array; + position: number; + size(): number; + reallocate(): void; + shrink(): Uint8Array; + put(b: number): void; + get(index: number): number; + putShort(num: number): void; + getShort(index: number): number; + putInt(num: number): void; + getInt(index: number): number; + readInt(): number; + putString(str: string): void; + getString(index: number): string; + } + + // util/DictionaryBuilder.js + interface DictionaryBuilder { + tid_entries: string[]; + unk_entries: string[]; + addTokenInfoDictionary(text: string): DictionaryBuilder; + costMatrix(matrix_text: string): DictionaryBuilder; + charDef(char_text: string): DictionaryBuilder; + unkDef(text: string): DictionaryBuilder; + build(): DynamicDictionaries; + buildTokenInfoDictionary(): {trie: doublearray.DoubleArray; token_info_dictionary: TokenInfoDictionary}; + buildUnknownDictionary(): UnknownDictionary; + buildConnectionCosts(): ConnectionCosts; + buildDoubleArray(): doublearray.DoubleArray; + } + + // util/IpadicFormatter.js + interface Formatter { + formatEntry(word_id: number, position: number, type: string, features: string[]): T; + formatUnknownEntry(word_id: number, position: number, type: string, features: string[]): T; + } + interface IpadicFormatter extends Formatter { + } + export interface IpadicFeatures { + word_id: number; + word_type: string; + word_position: number; + surface_form: string; + pos: string; + pos_detail_1: string; + pos_detail_2: string; + pos_detail_3: string; + conjugated_type: string; + conjugated_form: string; + basic_form: string; + reading?: string; + pronunciation?: string; + } + + // viterbi/ViterbiBuilder.js interface ViterbiBuilder { - trie: any; - token_info_dictionary: any; - unknown_dictionary: any; + trie: doublearray.DoubleArray; + token_info_dictionary: TokenInfoDictionary; + unknown_dictionary: UnknownDictionary; build(sentence_str: string): ViterbiLattice; - - } - - interface ViterbiSearcher { - connection_costs: any; - search(lattice: ViterbiLattice): ViterbiNode[]; - forward(lattice: ViterbiLattice) } + // viterbi/ViterbiLattice.js interface ViterbiLattice { append(node: ViterbiNode): void; appendEos(): void; } + // viterbi/ViterbiNode.js interface ViterbiNode { name: string; cost: number; @@ -55,42 +124,37 @@ declare module kuromoji { type: string; } - interface IpadicFormatter { - formatEntry(word_id: number, position: number, type: string, features: string[]): IpadicFormat; - formatUnknownEntry(word_id: number, position: number, type: string, features: string[]): IpadicFormat; + // viterbi/ViterbiSearcher.js + interface ViterbiSearcher { + connection_costs: ConnectionCosts; + search(lattice: ViterbiLattice): ViterbiNode[]; + forward(lattice: ViterbiLattice): ViterbiLattice; + backward(lattice: ViterbiLattice): ViterbiNode[]; } - interface IpadicFormat { - word_id: number; - word_type: string; - word_position: number; - surface_form: number; - pos: string; - pos_detail_1: string; - pos_detail_2: string; - pos_detail_3: string; - conjugated_type: string; - conjugated_form: string; - basic_form: string; - reading?: string; - pronunciation?: string; + // Tokenizer.js + interface TokenizerStatic { + splitByPunctuation(input: string): string[]; + } + interface Tokenizer { + token_info_dictionary: TokenInfoDictionary; + unknown_dictionary: UnknownDictionary; + viterbi_builder: ViterbiBuilder; + viterbi_searcher: ViterbiSearcher; + formatter: Formatter; + tokenize(text: string): T[]; + getLattice(text: string): ViterbiLattice; } - interface DictionaryBuilder { - tid_entries: number[]; - unk_entries: number[]; - matrix_text: string; - char_text: string; - addTokenInfoDictionary(text: string): DictionaryBuilder; - costMatrix(matrix_text: string): DictionaryBuilder; - charDef(char_text: string): DictionaryBuilder; - unkDef(text: string): DictionaryBuilder; - build(): DynamicDictionaries; + // TokenizerBuilder.js + interface TokenizerBuilder { + build(callback: (err: Error, tokenizer: Tokenizer) => void): void; + } + interface TokenizerBuilderOption { + dicPath?: string; } - interface DynamicDictionaries { - } - - export function builder(option?: TokenizerBuilderOption): TokenizerBuilder; + // kuromoji.js + export function builder(option: TokenizerBuilderOption): TokenizerBuilder; export function dictionaryBuilder(): DictionaryBuilder; } \ No newline at end of file