/*************************************************************************/ /* Copyright (c) 2013 Linas Vepstas */ /* All rights reserved */ /* */ /* Use of the Viterbi parsing system is subject to the terms of the */ /* license set forth in the LICENSE file included with this software. */ /* This license allows free redistribution and use in source and binary */ /* forms, with or without modification, subject to certain conditions. */ /* */ /*************************************************************************/ #include "compress.h" using namespace std; namespace link_grammar { namespace viterbi { // Merge a pair of word-lists, if they are mergable. // Internal-use only utility for the function below. static Seq* merge_wordlists(Seq* wla, Seq* wlb) { size_t sza = wla->get_arity(); size_t szb = wlb->get_arity(); if (sza != szb) return NULL; bool already_merged = false; OutList seq; for (size_t i = 0; i < sza; i++) { Atom* a = wla->get_outgoing_atom(i); Atom* b = wlb->get_outgoing_atom(i); if (a->operator==(b)) { seq.push_back(a); continue; } // If we are here, then the word csets differ. if (already_merged) return NULL; already_merged = true; // Check to see if the words are the same. WordCset* wcsa = dynamic_cast(a); WordCset* wcsb = dynamic_cast(b); Word* wa = wcsa->get_word(); Word* wb = wcsb->get_word(); // Words differ, its not mergable. if (not wa->operator==(wb)) return NULL; // OK, all is well, merge the connectors. Atom* csa = wcsa->get_cset(); Atom* csb = wcsb->get_cset(); Or* dj = new Or(csa, csb); dj = dj->flatten(); dj = dj->uniq(); WordCset* merge = new WordCset(wa, dj); seq.push_back(merge); } return new Seq(seq); } /// Try to shrink down a set of alternatives by collapsing repeated /// states into disjuncts. Best explained by example: Suppose that /// the input is /// SET : /// STATE_TRIPLE : /// SEQ : /// WORD : is /// WORD : a /// WORD : test /// SEQ : /// WORD_CSET : /// WORD : this /// CONNECTOR : Ss*b+ /// WORD_CSET : /// WORD : LEFT-WALL /// CONNECTOR : Wd+ /// SEQ : /// STATE_TRIPLE : /// SEQ : /// WORD : is /// WORD : a /// WORD : test /// SEQ : /// WORD_CSET : /// WORD : this /// CONNECTOR : Ss*b+ /// WORD_CSET : /// WORD : LEFT-WALL /// CONNECTOR : Wi+ /// SEQ : /// /// Then the compressed output will be: /// SET : /// STATE_TRIPLE : /// SEQ : /// WORD : is /// WORD : a /// WORD : test /// SEQ : /// WORD_CSET : /// WORD : this /// CONNECTOR : Ss*b+ /// WORD_CSET : /// WORD : LEFT-WALL /// OR : /// CONNECTOR : Wd+ /// CONNECTOR : Wi+ /// SEQ : /// /// Note how two alternative state triples were collapsed down into a /// single word cset. The goal of this compression is to shrink down /// the state to make the output more tractable, slightly less /// combinatoric-explosion-y. // // XXX TODO: if we created a special atom that held only alternatives, // then this would be a method on that that atom. Do we need such an // atom? It could help avoid confusion ... Set* compress_alternatives(Set* state_alternatives) { OutList alts; Seq* prev_in = NULL; Set* prev_out = NULL; Seq* merged = NULL; foreach_outgoing(StateTriple*, sp, state_alternatives) { // If the inputs or the outputs differ, the state triples are // fundamentally not mergable. Move along. Seq* in = sp->get_input(); Set* out = sp->get_output(); if ((not in->operator==(prev_in)) or (not out->operator==(prev_out))) { if (merged) alts.push_back(new StateTriple(prev_in, merged, prev_out)); prev_in = in; prev_out = out; merged = sp->get_state(); continue; } Seq* m = merge_wordlists(merged, sp->get_state()); if (NULL == m) { alts.push_back(sp); continue; } merged = m; } if (merged) alts.push_back(new StateTriple(prev_in, merged, prev_out)); return new Set(alts); } } // namespace viterbi } // namespace link-grammar