Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Speed-up readxml by caching key lookup instead of using try/except #1691

Merged
merged 3 commits into from
Nov 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions src/pyhf/readxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,21 @@ def import_root_histogram(rootdir, filename, path, name, filecache=None):
fullpath = str(Path(rootdir).joinpath(filename))
if fullpath not in filecache:
f = uproot.open(fullpath)
filecache[fullpath] = f
keys = set(f.keys(cycle=False))
filecache[fullpath] = (f, keys)
else:
f = filecache[fullpath]
try:
f, keys = filecache[fullpath]

fullname = "/".join([path, name])

if name in keys:
hist = f[name]
except (KeyError, uproot.deserialization.DeserializationError):
fullname = "/".join([path, name])
try:
hist = f[fullname]
except KeyError:
raise KeyError(
f'Both {name} and {fullname} were tried and not found in {fullpath}'
)
elif fullname in keys:
hist = f[fullname]
kratsg marked this conversation as resolved.
Show resolved Hide resolved
else:
raise KeyError(
f'Both {name} and {fullname} were tried and not found in {fullpath}'
)
return hist.to_numpy()[0].tolist(), extract_error(hist)


Expand Down
20 changes: 16 additions & 4 deletions tests/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,10 +201,22 @@ def test_import_measurement_gamma_bins(const):
pyhf.readxml.process_measurements(toplvl)


def test_import_prepHistFactory():
parsed_xml = pyhf.readxml.parse(
'validation/xmlimport_input/config/example.xml', 'validation/xmlimport_input/'
)
@pytest.mark.parametrize(
"configfile,rootdir",
[
(
'validation/xmlimport_input/config/example.xml',
'validation/xmlimport_input/',
),
(
'validation/xmlimport_input4/config/example.xml',
'validation/xmlimport_input4/',
),
],
ids=['xmlimport_input', 'xmlimport_input_histoPath'],
)
def test_import_prepHistFactory(configfile, rootdir):
parsed_xml = pyhf.readxml.parse(configfile, rootdir)

# build the spec, strictly checks properties included
spec = {
Expand Down
160 changes: 160 additions & 0 deletions validation/xmlimport_input4/config/HistFactorySchema.dtd
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@

<!-- The top level combination spec -->
<!-- OutputFilePrefix: Prefix to the output root file to be created (inspection histograms) -->
<!-- Mode: Type of the analysis -->
<!ELEMENT Combination (Function*,Input+,Measurement*)>
<!ATTLIST Combination
OutputFilePrefix CDATA #REQUIRED
Mode CDATA #IMPLIED
>

<!-- Input files detailing the channels. One channel per file -->
<!ELEMENT Function EMPTY>
<!ATTLIST Function
Name CDATA #REQUIRED
Expression CDATA #REQUIRED
Dependents CDATA #REQUIRED
>

<!-- Input files detailing the channels. One channel per file -->
<!ELEMENT Input (#PCDATA) >

<!-- Configuration for each measurement -->
<!-- Name: to be used as the heading in the table -->
<!-- Lumi: the luminosity of the measurement -->
<!-- LumiRelErr: the relative error known for the lumi -->
<!-- BinLow: the lowest bin number used for the measurement (inclusive) -->
<!-- BinHigh: the highest bin number used for the measurement (exclusive) -->
<!-- Mode: type of the measurement (a closed list of ...) -->
<!-- ExportOnly: if "True" skip fit, only export model -->
<!ELEMENT Measurement (POI,ParamSetting*,ConstraintTerm*) >
<!ATTLIST Measurement
Name CDATA #REQUIRED
Lumi CDATA #REQUIRED
LumiRelErr CDATA #REQUIRED
BinLow CDATA #IMPLIED
BinHigh CDATA #IMPLIED
Mode CDATA #IMPLIED
ExportOnly CDATA #IMPLIED
>

<!-- Specify what you are measuring. Corresponds to the name specified in the construction
of the model in the channel setup. Typically the NormFactor for xsec measurements -->
<!ELEMENT POI (#PCDATA) >

<!-- Specify what parameters are fixed, or have particular value -->
<!-- Val: set the value of the parameter -->
<!-- Const: set this parameter constant -->
<!ELEMENT ParamSetting (#PCDATA)>
<!ATTLIST ParamSetting
Val CDATA #IMPLIED
Const CDATA #IMPLIED
>

<!-- Specify an alternative shape to use for given constraint terms (Gaussian is used if this is not specified) -->
<!-- Type: can be Gamma or Uniform -->
<!-- RelativeUncertainty: relative uncertainty on the shape -->
<!ELEMENT ConstraintTerm (#PCDATA)>
<!ATTLIST ConstraintTerm
Type CDATA #REQUIRED
RelativeUncertainty CDATA #IMPLIED
>

<!-- Top element for channels. InputFile, HistoName and HistoPath
can be set at this level in which case they will become defaul to
all subsequent elements. Otherwise they can be set in individual
subelements -->
<!ELEMENT Channel (Data*,StatErrorConfig*,Sample+)>
<!-- InputFile: input file where the input histogram can be found (use abs path) -->
<!-- HistoPath: the path (within the root file) where the histogram can be found -->
<!-- HistoName: the name of the histogram to be used for this (and following in not overridden) item -->
<!ATTLIST Channel
Name CDATA #REQUIRED
InputFile CDATA #IMPLIED
HistoPath CDATA #IMPLIED
HistoName CDATA #IMPLIED
>

<!-- Data to be fit. If you don't provide it, Asimov data will be created -->
<!-- InputFile: any item set here will override the configuration for the subelements.
For this element there is no sublemenents so the setting will only have local effects -->
<!ELEMENT Data EMPTY>
<!ATTLIST Data
InputFile CDATA #IMPLIED
HistoPath CDATA #IMPLIED
HistoName CDATA #IMPLIED
>

<!ELEMENT StatErrorConfig EMPTY>
<!ATTLIST StatErrorConfig
RelErrorThreshold CDATA #IMPLIED
ConstraintType CDATA #IMPLIED
>


<!-- Sample elements are made up of systematic variations -->
<!ELEMENT Sample (StatError | HistoSys | OverallSys | ShapeSys | NormFactor | ShapeFactor)*>
<!ATTLIST Sample
Name CDATA #REQUIRED
InputFile CDATA #IMPLIED
HistoName CDATA #IMPLIED
HistoPath CDATA #IMPLIED
NormalizeByTheory CDATA #IMPLIED
>

<!-- Systematics for which the variation is provided by histograms -->
<!ELEMENT StatError EMPTY>
<!ATTLIST StatError
Activate CDATA #REQUIRED
HistoName CDATA #IMPLIED
InputFile CDATA #IMPLIED
HistoPath CDATA #IMPLIED
>

<!ELEMENT HistoSys EMPTY>
<!ATTLIST HistoSys
Name CDATA #REQUIRED
InputFile CDATA #IMPLIED
HistoFileHigh CDATA #IMPLIED
HistoPathHigh CDATA #IMPLIED
HistoNameHigh CDATA #IMPLIED
HistoFileLow CDATA #IMPLIED
HistoPathLow CDATA #IMPLIED
HistoNameLow CDATA #IMPLIED
>

<!-- Systematics for which the variation is provided by simple overall scaling -->
<!ELEMENT OverallSys EMPTY>
<!ATTLIST OverallSys
Name CDATA #REQUIRED
High CDATA #REQUIRED
Low CDATA #REQUIRED
>

<!-- Systematics for which the variation is provided by simple overall scaling -->
<!ELEMENT ShapeSys EMPTY>
<!ATTLIST ShapeSys
Name CDATA #REQUIRED
HistoName CDATA #REQUIRED
HistoPath CDATA #IMPLIED
InputFile CDATA #IMPLIED
ConstraintType CDATA #IMPLIED
>

<!-- Scaling factor, which may be the parameter of interest for cross section measurements-->
<!ELEMENT NormFactor EMPTY>
<!ATTLIST NormFactor
Name CDATA #REQUIRED
Val CDATA #REQUIRED
High CDATA #REQUIRED
Low CDATA #REQUIRED
Const CDATA #IMPLIED
>


<!-- Systematics for which the variation is provided by simple overall scaling -->
<!ELEMENT ShapeFactor EMPTY>
<!ATTLIST ShapeFactor
Name CDATA #REQUIRED
>

26 changes: 26 additions & 0 deletions validation/xmlimport_input4/config/example.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<!--
//============================================================================
// Name : example.xml
//============================================================================
-->

<!--
Top-level configuration, details for the example channel are in example_channel.xml.
This is the input file to the executable.

Note: Config.dtd needs to be accessible. It can be found in ROOT release area.
The file system path is relative to location of this XML file, not the executable.
-->

<!DOCTYPE Combination SYSTEM 'HistFactorySchema.dtd'>

<Combination OutputFilePrefix="./results/example" >

<Input>./config/example_channel.xml</Input>

<Measurement Name="GaussExample" Lumi="1." LumiRelErr="0.1" >
<POI>SigXsecOverSM</POI>
<ParamSetting Const="True">Lumi alpha_syst1</ParamSetting>
</Measurement>

</Combination>
22 changes: 22 additions & 0 deletions validation/xmlimport_input4/config/example_channel.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<!DOCTYPE Channel SYSTEM 'HistFactorySchema.dtd'>

<Channel Name="channel1" InputFile="./data/example.root" >
<Data HistoName="data" HistoPath="" />

<!-- Set the StatError type to Poisson. Can also be Gaussian -->
<StatErrorConfig RelErrorThreshold="0.05" ConstraintType="Poisson" />

<Sample Name="signal" HistoPath="/signal" HistoName="hsignal_channel1_obs_cuts">
<OverallSys Name="syst1" High="1.05" Low="0.95"/>
<NormFactor Name="SigXsecOverSM" Val="1" Low="0." High="3." />
</Sample>
<Sample Name="background1" HistoPath="/background" NormalizeByTheory="True" HistoName="hbackground1_obs_cuts">
<StatError Activate="True" HistoName="hbackground1_statUncert" HistoPath="/background" />
<OverallSys Name="syst2" Low="0.95" High="1.05"/>
</Sample>
<Sample Name="background2" HistoPath="/background" NormalizeByTheory="True" HistoName="hbackground2_obs_cuts">
<StatError Activate="True" /> <!-- Use Default Histogram Errors as input to StatError -->
<OverallSys Name="syst3" Low="0.95" High="1.05"/>
<!-- <HistoSys Name="syst4" HistoNameHigh="HighHistForSyst4" HistoNameLow="LowHistForSyst4"/>-->
</Sample>
</Channel>
Binary file added validation/xmlimport_input4/data/example.root
Binary file not shown.