Skip to content

Commit

Permalink
[DF][RDatasetSpec] Avoid specifying chain name
Browse files Browse the repository at this point in the history
  • Loading branch information
ikabadzhov committed May 16, 2022
1 parent 4b066cc commit 2629171
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 55 deletions.
36 changes: 21 additions & 15 deletions tree/dataframe/inc/ROOT/RDF/RDatasetSpec.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -38,33 +38,39 @@ struct RDatasetSpec {
}
};

std::string fDatasetName{}; ///< The name of the dataset to process.
/**
* A list of names of trees.
* If the list holds a single name, that tree is read from every entry of fFileNameGlobs. Otherwise the list goes
* in lockstep with fFileNameGlobs: the i-th file is expected to contain a tree with the i-th name.
*/
std::vector<std::string> fTreeNames{};

/**
* A list of file names.
* They can contain the globbing characters supported by TChain. See TChain::Add for more information.
*/

std::vector<std::string> fFileNameGlobs{};

ULong64_t fStartEntry{}; ///< The entry where the dataset processing should start (inclusive).
ULong64_t fEndEntry{}; ///< The entry where the dataset processing should end (exclusive).

/**
* A list of names of trees.
* This list should go in lockstep with fFileNameGlobs, only in case this dataset is a TChain where each file
* contains its own tree with a different name from the global name of the dataset.
*/
std::vector<std::string> fSubTreeNames{};
RDatasetSpec(const std::string &treeName, const std::string &fileName, REntryRange entryRange = {})
: fTreeNames(std::vector<std::string>{treeName}), fFileNameGlobs(std::vector<std::string>{fileName}),
fStartEntry(entryRange.fStartEntry), fEndEntry(entryRange.fEndEntry)
{
}

RDatasetSpec(const std::string &datasetName, const std::string &fileName, REntryRange entryRange = {},
const std::vector<std::string> &subTreenames = {})
: fDatasetName(datasetName), fFileNameGlobs(std::vector<std::string>{fileName}),
fStartEntry(entryRange.fStartEntry), fEndEntry(entryRange.fEndEntry), fSubTreeNames(subTreenames)
RDatasetSpec(const std::string &treeName, const std::vector<std::string> &fileNames, REntryRange entryRange = {})
: fTreeNames(std::vector<std::string>{treeName}), fFileNameGlobs(fileNames), fStartEntry(entryRange.fStartEntry),
fEndEntry(entryRange.fEndEntry)
{
}

RDatasetSpec(const std::string &datasetName, const std::vector<std::string> &fileNames, REntryRange entryRange = {},
const std::vector<std::string> &subTreenames = {})
: fDatasetName(datasetName), fFileNameGlobs(fileNames), fStartEntry(entryRange.fStartEntry),
fEndEntry(entryRange.fEndEntry), fSubTreeNames(subTreenames)
RDatasetSpec(const std::vector<std::string> &treeNames, const std::vector<std::string> &fileNames,
REntryRange entryRange = {})
: fTreeNames(treeNames), fFileNameGlobs(fileNames), fStartEntry(entryRange.fStartEntry),
fEndEntry(entryRange.fEndEntry)
{
}
};
Expand Down
17 changes: 7 additions & 10 deletions tree/dataframe/src/RLoopManager.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -364,25 +364,22 @@ RLoopManager::RLoopManager(const ROOT::RDF::RDatasetSpec &spec)
fLoopType(ROOT::IsImplicitMTEnabled() ? ELoopType::kROOTFilesMT : ELoopType::kROOTFiles),
fNewSampleNotifier(fNSlots), fSampleInfos(fNSlots)
{
// A TChain has a global name
auto chain = std::make_shared<TChain>(spec.fDatasetName.c_str());

if (spec.fSubTreeNames.empty()){
// The global name of the chain is also the name of each tree in the list
// of files that make the chain.
if (spec.fTreeNames.size() == 1) { // a single tree (might be multiple files)
auto chain = std::make_shared<TChain>(spec.fTreeNames[0].c_str());
for (const auto &f : spec.fFileNameGlobs)
chain->Add(f.c_str());
SetTree(chain);
} else {
// Some other times, each different file has its own tree name, we need to
// reconstruct the full path to the tree in each file and pass that to
// TChain::Add
auto nfiles = spec.fFileNameGlobs.size();
for (decltype(nfiles) i = 0; i < nfiles; i++){
const auto fullpath = spec.fFileNameGlobs[i] + "?#" + spec.fSubTreeNames[i];
auto chain = std::make_shared<TChain>();
for (auto i = 0u; i < spec.fFileNameGlobs.size(); ++i) {
const auto fullpath = spec.fFileNameGlobs[i] + "?#" + spec.fTreeNames[i];
chain->Add(fullpath.c_str());
}
SetTree(chain);
}
SetTree(chain);
}

struct RSlotRAII {
Expand Down
42 changes: 12 additions & 30 deletions tree/dataframe/test/dataframe_datasetspec.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
using namespace ROOT;
using namespace ROOT::RDF;

using namespace std::literals; // remove ambiguity of using std::vector<std::string>-s and std::string-s

TEST(RDFDatasetSpec, SingleFileSingleColConstructor)
{
auto dfWriter = RDataFrame(5).Define("x", [](ULong64_t e) { return int(e); }, {"rdfentry_"});
Expand Down Expand Up @@ -46,24 +48,16 @@ TEST(RDFDatasetSpec, SingleFileSingleColConstructor)
const auto dfRDS1 = RDataFrame(RDatasetSpec("tree", "file.root", {2, 4})).Display<int>({"x"})->AsString();
EXPECT_EQ(dfRDS1, dfRange0);

// specify the treename as fourth argument => the first argument becomes the name of the chain of trees
const auto dfRDS4 = RDataFrame(RDatasetSpec("chain", "file.root", {2, 4}, {"tree"})).Display<int>({"x"})->AsString();
EXPECT_EQ(dfRDS4, dfRange0);

// specify the chain to have the same name as the tree
const auto dfRDS5 = RDataFrame(RDatasetSpec("tree", "file.root", {2, 4}, {"tree"})).Display<int>({"x"})->AsString();
EXPECT_EQ(dfRDS5, dfRange0);

// specify 2 trees, second tree is irrelevant, this is correct
const auto dfRDS6 =
RDataFrame(RDatasetSpec("chain", "file.root", {2, 4}, {"tree", "nottree"})).Display<int>({"x"})->AsString();
RDataFrame(RDatasetSpec({"tree"s, "nottree"s}, {"file.root"s}, {2, 4})).Display<int>({"x"})->AsString();
EXPECT_EQ(dfRDS6, dfRange0);

// specify 2 trees, first tree is irrelevant, this is wrong, emitting C++ error and ROOT error
EXPECT_THROW(
try {
ROOT_EXPECT_ERROR(
RDataFrame(RDatasetSpec("chain", "file.root", {2, 4}, {"nottree", "tree"})).Display<int>({"x"})->AsString(),
RDataFrame(RDatasetSpec({"nottree"s, "tree"s}, {"file.root"s}, {2, 4})).Display<int>({"x"})->AsString(),
"TChain::LoadTree", "Cannot find tree with name nottree in file file.root");
} catch (const std::runtime_error &err) {
EXPECT_EQ(std::string(err.what()),
Expand Down Expand Up @@ -102,8 +96,6 @@ TEST(RDFDatasetSpec, SingleFileSingleColConstructor)
"first entry out of range 0..5");
EXPECT_EQ(dfRDS10AsString, dfSimple);

using namespace std::literals; // remove ambiguity of using std::vector<std::string>-s and std::string-s

// test the second constructor, second argument is now a vector
const auto dfRDS13 = RDataFrame(RDatasetSpec("tree", {"file.root"s})).Display<int>({"x"})->AsString();
EXPECT_EQ(dfRDS13, dfSimple);
Expand Down Expand Up @@ -189,8 +181,6 @@ TEST(RDFDatasetSpec, SingleFileMultiColsConstructor)
},
std::logic_error);

using namespace std::literals; // remove ambiguity of using std::vector<std::string>-s and std::string-s

// test the second constructor, second argument is now a vector
const auto dfRDS12 = RDataFrame(RDatasetSpec("tree", {"file.root"s})).Display()->AsString();
EXPECT_EQ(dfRDS12, dfSimple);
Expand All @@ -217,36 +207,28 @@ TEST(RDFDatasetSpec, MultipleFiles)
const auto dfRange2 = RDataFrame(ch0).Range(2).Display<int, int>({"x", "y"})->AsString();
const auto dfEmpty = "+-----+---+---+\n| Row | x | y | \n| | | | \n+-----+---+---+\n";

using namespace std::literals;

// both files have the same tree, do not ask for chain
const auto dfRDS0 = RDataFrame(RDatasetSpec("treeA", {"file0.root"s, "file1.root"s})).Display()->AsString();
EXPECT_EQ(dfRDS0, dfSimple);

// both files have the same tree, but ask for chain
const auto dfRDS1 = RDataFrame(RDatasetSpec("chain", {"file0.root"s, "file1.root"s}, {0, 5}, {"treeA", "treeA"}))
.Display()
->AsString();
const auto dfRDS1 =
RDataFrame(RDatasetSpec({"treeA"s, "treeA"s}, {"file0.root"s, "file1.root"s}, {0, 5})).Display()->AsString();
EXPECT_EQ(dfRDS1, dfSimple);

// files contain trees with different names => specify one tree name per file
const auto dfRDS2 = RDataFrame(RDatasetSpec("chain", {"file1.root"s, "file2.root"s}, {0, 5}, {"treeA", "treeB"}))
.Display()
->AsString();
const auto dfRDS2 =
RDataFrame(RDatasetSpec({"treeA"s, "treeB"s}, {"file1.root"s, "file2.root"s}, {0, 5})).Display()->AsString();
EXPECT_EQ(dfRDS2, dfSimple);

// cases similar to above, but now range is applied, note that the range is global (i.e. not per tree, but per chain)
const auto dfRDS3 = RDataFrame(RDatasetSpec("treeA", {"file0.root"s, "file1.root"s}, {1, 2})).Display()->AsString();
EXPECT_EQ(dfRDS3, dfRange);

const auto dfRDS4 = RDataFrame(RDatasetSpec("chain", {"file0.root"s, "file1.root"s}, {1, 2}, {"treeA", "treeA"}))
.Display()
->AsString();
const auto dfRDS4 =
RDataFrame(RDatasetSpec({"treeA"s, "treeA"s}, {"file0.root"s, "file1.root"s}, {1, 2})).Display()->AsString();
EXPECT_EQ(dfRDS4, dfRange);

const auto dfRDS5 = RDataFrame(RDatasetSpec("chain", {"file1.root"s, "file2.root"s}, {1, 2}, {"treeA", "treeB"}))
.Display()
->AsString();
const auto dfRDS5 =
RDataFrame(RDatasetSpec({"treeA"s, "treeB"s}, {"file1.root"s, "file2.root"s}, {1, 2})).Display()->AsString();
EXPECT_EQ(dfRDS5, dfRange);

// specify irregular range [6, 7) (similar to above)
Expand Down

0 comments on commit 2629171

Please sign in to comment.