Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to generate LM image and GC via two separate jobs #446

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .reverse_arpa import *
from .vocabulary import *
from .srilm import *
from .util import *
2 changes: 1 addition & 1 deletion lm/lm_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def __init__(
extra_config=None,
extra_post_config=None,
encoding="utf-8",
mem=2,
mem=12,
Atticus1806 marked this conversation as resolved.
Show resolved Hide resolved
):
kwargs = locals()
del kwargs["self"]
Expand Down
24 changes: 24 additions & 0 deletions lm/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from typing import List, Optional, Tuple

import i6_core.rasr as rasr


def _has_image(c: rasr.RasrConfig, pc: rasr.RasrConfig):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I would either like to spell config and post_config out or have a small docstring here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with @Atticus1806

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with @Atticus1806

res = c._get("image") is not None
res = res or (pc is not None and pc._get("image") is not None)
return res


def find_arpa_lms(
    lm_config: "rasr.RasrConfig", lm_post_config: "Optional[rasr.RasrConfig]" = None
) -> "List[Tuple[rasr.RasrConfig, Optional[rasr.RasrConfig]]]":
    """
    Recursively collect all ARPA LM (sub-)configs that do not yet have an image.

    LMs of type ``"combine"`` are descended into via their ``lm-<i>``
    sub-configs. An LM that already defines an ``image`` — either in the
    config or in the corresponding post-config — is skipped, since no image
    needs to be generated for it.

    NOTE: the ``image`` entry is usually set in the post-config, so pass
    ``lm_post_config`` whenever it is available; otherwise LMs that are
    already covered by an image will be returned as well.

    :param lm_config: LM config, e.g. ``crp.language_model_config``.
    :param lm_post_config: matching post-config, may be ``None``.
    :return: list of ``(config, post_config)`` pairs; ``post_config`` entries
        may be ``None``.
    """
    result = []

    if lm_config.type == "ARPA":
        if not _has_image(lm_config, lm_post_config):
            result.append((lm_config, lm_post_config))
    elif lm_config.type == "combine":
        for i in range(1, lm_config.num_lms + 1):
            sub_lm_config = lm_config[f"lm-{i}"]
            sub_lm_post_config = lm_post_config[f"lm-{i}"] if lm_post_config is not None else None
            result += find_arpa_lms(sub_lm_config, sub_lm_post_config)

    return result
2 changes: 1 addition & 1 deletion rasr/crp.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def set_executables(self, rasr_binary_path, rasr_arch="linux-x86_64-standard"):
self.flf_tool_exe = rasr_binary_path.join_right(f"flf-tool.{rasr_arch}")
self.kws_tool_exe = None # does not exist
self.lattice_processor_exe = rasr_binary_path.join_right(f"lattice-processor.{rasr_arch}")
self.lm_util_exe = None # does not exist
self.lm_util_exe = rasr_binary_path.join_right(f"lm-util.{rasr_arch}")
michelwi marked this conversation as resolved.
Show resolved Hide resolved
self.nn_trainer_exe = rasr_binary_path.join_right(f"nn-trainer.{rasr_arch}")
self.speech_recognizer_exe = rasr_binary_path.join_right(f"speech-recognizer.{rasr_arch}")

Expand Down
82 changes: 48 additions & 34 deletions recognition/advanced_tree_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

Path = setup_path(__package__)

import copy
import math
import os
import shutil
Expand Down Expand Up @@ -69,25 +70,6 @@ def run(self):
def cleanup_before_run(self, cmd, retry, *args):
util.backup_if_exists("lm_and_state_tree.log")

@classmethod
def find_arpa_lms(cls, lm_config, lm_post_config=None):
    """
    Recursively gather ``(config, post_config)`` pairs for all ARPA LMs in
    *lm_config* that do not yet define an ``image`` entry (checked in both
    the config and, if given, the post-config). "combine" LMs are descended
    into via their ``lm-<i>`` sub-configs.
    """

    def image_missing(cfg, post_cfg):
        # image may live in the config or in the post-config
        if cfg._get("image") is not None:
            return False
        return post_cfg is None or post_cfg._get("image") is None

    collected = []
    if lm_config.type == "ARPA":
        if image_missing(lm_config, lm_post_config):
            collected.append((lm_config, lm_post_config))
    elif lm_config.type == "combine":
        for idx in range(1, lm_config.num_lms + 1):
            sub_cfg = lm_config["lm-%d" % idx]
            sub_post_cfg = None if lm_post_config is None else lm_post_config["lm-%d" % idx]
            collected.extend(cls.find_arpa_lms(sub_cfg, sub_post_cfg))
    return collected

@classmethod
def create_config(cls, crp, feature_scorer, extra_config, extra_post_config, **kwargs):
config, post_config = rasr.build_config_from_mapping(
Expand Down Expand Up @@ -117,7 +99,7 @@ def create_config(cls, crp, feature_scorer, extra_config, extra_post_config, **k
config.flf_lattice_tool.network.recognizer.feature_extraction.file = "dummy.flow"
config.flf_lattice_tool.network.recognizer.lm.scale = 1.0

arpa_lms = cls.find_arpa_lms(
arpa_lms = lm.find_arpa_lms(
config.flf_lattice_tool.network.recognizer.lm,
post_config.flf_lattice_tool.network.recognizer.lm if post_config is not None else None,
)
Expand Down Expand Up @@ -167,6 +149,7 @@ def __init__(
lmgc_mem: float = 12.0,
lmgc_alias: Optional[str] = None,
lmgc_scorer: Optional[rasr.FeatureScorer] = None,
separate_lmi_gc_generation: bool = True,
model_combination_config: Optional[rasr.RasrConfig] = None,
model_combination_post_config: Optional[rasr.RasrConfig] = None,
extra_config: Optional[rasr.RasrConfig] = None,
Expand All @@ -190,6 +173,7 @@ def __init__(
:param lmgc_mem: Memory requirement for the AdvancedTreeSearchLmImageAndGlobalCacheJob
:param lmgc_alias: Alias for the AdvancedTreeSearchLmImageAndGlobalCacheJob
:param lmgc_scorer: Dummy scorer for the AdvancedTreeSearchLmImageAndGlobalCacheJob which is required but unused
:param separate_lmi_gc_generation: Whether to generate the LM image and the global cache via two separate jobs for a more stable hash. Whether or not this flag is set is not part of the hash, so using separate jobs is the default.
:param model_combination_config: Configuration for model combination
:param model_combination_post_config: Post config for model combination
:param extra_config: Additional Config for recognition
Expand All @@ -206,6 +190,8 @@ def __init__(
self.config,
self.post_config,
self.lm_gc_job,
self.gc_job,
self.lm_image_jobs,
) = AdvancedTreeSearchJob.create_config(**kwargs)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is .create_config only called within this job? Otherwise we need to be careful with changing the returned variables. But I think you caught the cases here and otherwise the change is easy.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the AdvancedTreeSearchWithRescoringJob below in this file inherits from AdvancedTreeSearchJob and uses super().create_config. that should be fixed.

self.feature_flow = feature_flow
self.exe = self.select_exe(crp.flf_tool_exe, "flf-tool")
Expand Down Expand Up @@ -286,18 +272,46 @@ def create_config(
lmgc_mem: float,
lmgc_alias: Optional[str],
lmgc_scorer: Optional[rasr.FeatureScorer],
separate_lmi_gc_generation: bool,
model_combination_config: Optional[rasr.RasrConfig],
model_combination_post_config: Optional[rasr.RasrConfig],
extra_config: Optional[rasr.RasrConfig],
extra_post_config: Optional[rasr.RasrConfig],
**kwargs,
):
lm_gc = AdvancedTreeSearchLmImageAndGlobalCacheJob(
crp, lmgc_scorer if lmgc_scorer is not None else feature_scorer, extra_config, extra_post_config
)
if lmgc_alias is not None:
lm_gc.add_alias(lmgc_alias)
lm_gc.rqmt["mem"] = lmgc_mem
def specialize_lm_config(crp, lm_config):
    """Return a deep copy of *crp* whose ``language_model_config`` is replaced by *lm_config*."""
    specialized_crp = copy.deepcopy(crp)
    specialized_crp.language_model_config = lm_config
    return specialized_crp

if separate_lmi_gc_generation:
gc_job = BuildGlobalCacheJob(crp, extra_config, extra_post_config)
michelwi marked this conversation as resolved.
Show resolved Hide resolved

arpa_lms = lm.find_arpa_lms(crp.language_model_config, None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function lm.find_arpa_lms only returns the LMs that do not already have an lm.image - because for those that already have an image we do not need to create a new one.

But the lm.image usually is defined in the post_config. When we here not pass the post config, then all (arpa) LMs are found and returned. Therefore we do extra work here.
And even worse: below in line 404/418 we call the function again, but with the post config. So if the original crp contain a mix of arpa LMs out of which some have already images and others do not, then the items in arpa_lms differ between the calls. And since we use the index to match the image to the LM, the mapping will be off and the wrong image will be assigned.

Suggested change
arpa_lms = lm.find_arpa_lms(crp.language_model_config, None)
arpa_lms = lm.find_arpa_lms(crp.language_model_config, crp.language_model_post_config)

lm_image_jobs = {
(i + 1): lm.CreateLmImageJob(
michelwi marked this conversation as resolved.
Show resolved Hide resolved
specialize_lm_config(crp, lm_config), extra_config=extra_config, extra_post_config=extra_post_config
)
for i, (lm_config, _lm_post_config) in enumerate(arpa_lms)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason for the _. Maybe I am overlooking something in the web view. If its not used you could just fully replace it with _

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the `post_config` here is unused, because

  • both config and post_config are extracted from the crp
  • only the config is modified
  • the crp with the old post_config is still being used

+1 to can be just _ to make clear it is unused

Suggested change
for i, (lm_config, _lm_post_config) in enumerate(arpa_lms)
for i, (lm_config, _) in enumerate(arpa_lms)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the `post_config` here is unused, because

  • both config and post_config are extracted from the crp
  • only the config is modified
  • the crp with the old post_config is still being used

+1 to can be just _ to make clear it is unused

Suggested change
for i, (lm_config, _lm_post_config) in enumerate(arpa_lms)
for i, (lm_config, _) in enumerate(arpa_lms)

}

gc = gc_job.out_global_cache
lm_images = {k: v.out_image for k, v in lm_image_jobs.items()}

lm_gc = None
else:
lm_gc = AdvancedTreeSearchLmImageAndGlobalCacheJob(
crp, lmgc_scorer if lmgc_scorer is not None else feature_scorer, extra_config, extra_post_config
)
if lmgc_alias is not None:
lm_gc.add_alias(lmgc_alias)
lm_gc.rqmt["mem"] = lmgc_mem

gc = lm_gc.out_global_cache
lm_images = lm_gc.out_lm_images

gc_job = None
lm_image_jobs = {}

search_parameters = cls.update_search_parameters(search_parameters)

Expand Down Expand Up @@ -397,14 +411,14 @@ def create_config(
]

post_config.flf_lattice_tool.global_cache.read_only = True
post_config.flf_lattice_tool.global_cache.file = lm_gc.out_global_cache
post_config.flf_lattice_tool.global_cache.file = gc

arpa_lms = AdvancedTreeSearchLmImageAndGlobalCacheJob.find_arpa_lms(
arpa_lms = lm.find_arpa_lms(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have called lm.find_arpa_lms above. Maybe move the above call out of the if condition and then reuse the arpa_lms from above here. This would also avoid mismatching items in the list as I outlined above.

config.flf_lattice_tool.network.recognizer.lm,
post_config.flf_lattice_tool.network.recognizer.lm,
)
for i, lm_config in enumerate(arpa_lms):
lm_config[1].image = lm_gc.out_lm_images[i + 1]
for i, (_lm_config, lm_post_config) in enumerate(arpa_lms):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above

lm_post_config.image = lm_images[i + 1]
michelwi marked this conversation as resolved.
Show resolved Hide resolved

# Remaining Flf-network

Expand Down Expand Up @@ -438,11 +452,11 @@ def create_config(
config._update(extra_config)
post_config._update(extra_post_config)

return config, post_config, lm_gc
return config, post_config, lm_gc, gc_job, lm_image_jobs

@classmethod
def hash(cls, kwargs):
config, post_config, lm_gc = cls.create_config(**kwargs)
config, post_config, *jobs = cls.create_config(**kwargs)
return super().hash(
{
"config": config,
Expand Down Expand Up @@ -817,7 +831,7 @@ class BuildGlobalCacheJob(rasr.RasrCommand, Job):
Standalone job to create the global-cache for advanced-tree-search
"""

def __init__(self, crp, extra_config=None, extra_post_config=None):
def __init__(self, crp, extra_config=None, extra_post_config=None, mem=12):
michelwi marked this conversation as resolved.
Show resolved Hide resolved
"""
:param rasr.CommonRasrParameters crp: common RASR params (required: lexicon, acoustic_model, language_model, recognizer)
:param rasr.Configuration extra_config: overlay config that influences the Job's hash
Expand All @@ -837,7 +851,7 @@ def __init__(self, crp, extra_config=None, extra_post_config=None):
self.out_log_file = self.log_file_output_path("build_global_cache", crp, False)
self.out_global_cache = self.output_path("global.cache", cached=True)

self.rqmt = {"time": 1, "cpu": 1, "mem": 2}
self.rqmt = {"time": 1, "cpu": 1, "mem": mem}

def tasks(self):
yield Task("create_files", mini_task=True)
Expand Down
Loading