Skip to content

Commit

Permalink
✨ cgspell limit/beam/max-weight configurable from pipespec.xml
Browse files Browse the repository at this point in the history
E.g.

    <cgspell limit="5" beam="20" max-weight="2000">
  • Loading branch information
unhammer committed May 13, 2020
1 parent 2e7768d commit a0533a7
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 8 deletions.
10 changes: 8 additions & 2 deletions src/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ void CGCmd::run(stringstream& input, stringstream& output) const
}

#ifdef HAVE_CGSPELL
CGSpellCmd::CGSpellCmd (hfst_ospell::Transducer* errmodel, hfst_ospell::Transducer* acceptor, float max_sent_unknown_rate, bool verbose)
CGSpellCmd::CGSpellCmd (hfst_ospell::Transducer* errmodel, hfst_ospell::Transducer* acceptor, int limit, float beam, float max_weight, float max_sent_unknown_rate, bool verbose)
: speller(new Speller(errmodel, acceptor, verbose, max_analysis_weight, max_weight, real_word, limit, beam, time_cutoff, max_sent_unknown_rate))
{
if (!acceptor) {
Expand All @@ -97,7 +97,7 @@ CGSpellCmd::CGSpellCmd (hfst_ospell::Transducer* errmodel, hfst_ospell::Transduc
throw std::runtime_error("libdivvun: ERROR: CGSpell command couldn't read errmodel");
}
}
CGSpellCmd::CGSpellCmd (const string& err_path, const string& lex_path, float max_sent_unknown_rate, bool verbose)
// Construct a CGSpellCmd from file paths: builds the underlying Speller from
// err_path and lex_path, forwarding the caller-supplied limit, beam,
// max_weight, max_sent_unknown_rate and verbose.
// The remaining Speller arguments (max_analysis_weight, real_word,
// time_cutoff) are not parameters here; they are taken from names in class
// scope rather than from the caller.
// NOTE(review): limit is a signed int in this signature; presumably callers
// pass a non-negative value -- confirm how Speller treats a negative limit.
CGSpellCmd::CGSpellCmd (const string& err_path, const string& lex_path, int limit, float beam, float max_weight, float max_sent_unknown_rate, bool verbose)
: speller(new Speller(err_path, lex_path, verbose, max_analysis_weight, max_weight, real_word, limit, beam, time_cutoff, max_sent_unknown_rate))
{
}
Expand Down Expand Up @@ -211,6 +211,9 @@ Pipeline Pipeline::mkPipeline(const unique_ptr<ArPipeSpec>& ar_spec, const u16st
};
auto* s = new CGSpellCmd(readArchiveExtract(ar_spec->ar_path, args["errmodel"], f),
readArchiveExtract(ar_spec->ar_path, args["lexicon"], f),
cmd.attribute("limit").as_int(10),
cmd.attribute("beam").as_float(15.0),
cmd.attribute("max-weight").as_float(5000.0),
cmd.attribute("max-unknown-rate").as_float(0.4),
verbose);
cmds.emplace_back(s);
Expand Down Expand Up @@ -292,6 +295,9 @@ Pipeline Pipeline::mkPipeline(const unique_ptr<PipeSpec>& spec, const u16string&
#ifdef HAVE_CGSPELL
cmds.emplace_back(new CGSpellCmd(args["errmodel"],
args["lexicon"],
cmd.attribute("limit").as_int(10),
cmd.attribute("beam").as_float(15.0),
cmd.attribute("max-weight").as_float(5000.0),
cmd.attribute("max-unknown-rate").as_float(0.4),
verbose));
#else
Expand Down
7 changes: 2 additions & 5 deletions src/pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,17 +167,14 @@ class CGCmd: public PipeCmd {
#ifdef HAVE_CGSPELL
class CGSpellCmd: public PipeCmd {
public:
CGSpellCmd (hfst_ospell::Transducer* errmodel, hfst_ospell::Transducer* acceptor, float max_sent_unknown_rate, bool verbose);
CGSpellCmd (const string& err_path, const string& lex_path, float max_sent_unknown_rate, bool verbose);
CGSpellCmd (hfst_ospell::Transducer* errmodel, hfst_ospell::Transducer* acceptor, int limit, float beam, float max_weight, float max_sent_unknown_rate, bool verbose);
CGSpellCmd (const string& err_path, const string& lex_path, int limit, float beam, float max_weight, float max_sent_unknown_rate, bool verbose);
void run(stringstream& input, stringstream& output) const override;
~CGSpellCmd() override = default;
// Some sane defaults for the speller options that are not passed to the constructor
// TODO: Do we want these configurable from pipespec.xml as well, or from the Checker API?
static constexpr Weight max_analysis_weight = -1.0;
static constexpr Weight max_weight = 5000.0;
static constexpr bool real_word = false;
static constexpr unsigned long limit = 10;
static constexpr hfst_ospell::Weight beam = 15.0;
static constexpr float time_cutoff = 0.0;
private:
unique_ptr<Speller> speller;
Expand Down
8 changes: 7 additions & 1 deletion src/pipespec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,14 @@ vector<std::pair<string,string>> toPipeSpecShVector(const string& dir, const Pip
prog += " -g" + argprepare(dir, args["grammar"]);
}
else if(name == "cgspell") {
int limit = cmd.attribute("limit").as_int(10);
float beam = cmd.attribute("beam").as_float(15);
float max_weight = cmd.attribute("max-weight").as_float(5000);
float max_sent_unknown_rate = cmd.attribute("max-unknown-rate").as_float(0.4);
prog = "divvun-cgspell -n 10 -b 15 -w 5000";
prog = "divvun-cgspell";
prog += " -n " + std::to_string(limit);
prog += " -b " + std::to_string(beam);
prog += " -w " + std::to_string(max_weight);
prog += " -u " + std::to_string(max_sent_unknown_rate);
prog += " -l" + argprepare(dir, args["lexicon"]);
prog += " -m" + argprepare(dir, args["errmodel"]);
Expand Down
3 changes: 3 additions & 0 deletions src/pipespec.dtd
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@
<!ELEMENT cg (grammar)> <!-- arg: grammar.cg3 -->
<!ELEMENT cgspell ((lexicon, errmodel)|(errmodel, lexicon))> <!-- arg1: acceptor.hfstol, arg2: errmodel.hfst -->
<!ATTLIST cgspell
limit CDATA "10"
beam CDATA "15.0"
max-weight CDATA "5000.0"
max-unknown-rate CDATA "0.4">
<!ELEMENT tokenize (tokenizer)> <!-- arg: tokeniser.pmhfst -->
<!ELEMENT tokenise (tokenizer)> <!-- en_GB alias of the above -->
Expand Down
3 changes: 3 additions & 0 deletions src/pipespec.rnc
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ cgspell =
# arg1: acceptor.hfstol, arg2: errmodel.hfst
attlist.cgspell &=

[ a:defaultValue = "10" ] attribute limit { text }?,
[ a:defaultValue = "15.0" ] attribute beam { text }?,
[ a:defaultValue = "5000.0" ] attribute max-weight { text }?,
[ a:defaultValue = "0.4" ] attribute max-unknown-rate { text }?
tokenize = element tokenize { attlist.tokenize, tokenizer }
# arg: tokeniser.pmhfst
Expand Down

0 comments on commit a0533a7

Please sign in to comment.