Skip to content

Commit

Permalink
[DF][RDatasetSpec] Initial version of friend trees handling
Browse files Browse the repository at this point in the history
  • Loading branch information
ikabadzhov committed Jun 3, 2022
1 parent 11a7f9a commit f67e9f7
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 9 deletions.
31 changes: 29 additions & 2 deletions tree/dataframe/inc/ROOT/RDF/RDatasetSpec.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <stdexcept> // std::logic_error

#include <RtypesCore.h>
#include <ROOT/InternalTreeUtils.hxx>

namespace ROOT {

Expand Down Expand Up @@ -49,12 +50,13 @@ struct RDatasetSpec {
* A list of file names.
* They can contain the globbing characters supported by TChain. See TChain::Add for more information.
*/

std::vector<std::string> fFileNameGlobs{};

ULong64_t fStartEntry{}; ///< The entry where the dataset processing should start (inclusive).
ULong64_t fEndEntry{}; ///< The entry where the dataset processing should end (exclusive).

ROOT::Internal::TreeUtils::RFriendInfo fFriendInfo{}; ///< List of friends

RDatasetSpec(const std::string &treeName, const std::string &fileName, REntryRange entryRange = {})
: fTreeNames(std::vector<std::string>{treeName}), fFileNameGlobs(std::vector<std::string>{fileName}),
fStartEntry(entryRange.fStartEntry), fEndEntry(entryRange.fEndEntry)
Expand All @@ -71,11 +73,36 @@ struct RDatasetSpec {
REntryRange entryRange = {})
: fTreeNames(
fileNames.size() != treeNames.size() && treeNames.size() != 1
? throw std::runtime_error("RDatasetSpec exepcts either N trees and N files, or 1 tree and N files.")
? throw std::logic_error("RDatasetSpec exepcts either N trees and N files, or 1 tree and N files.")
: treeNames),
fFileNameGlobs(fileNames), fStartEntry(entryRange.fStartEntry), fEndEntry(entryRange.fEndEntry)
{
}

/// Register a friend that lives as a single tree in a single file.
/// \param[in] treeName Name of the friend tree; it is also used as the friend's alias.
/// \param[in] fileName Name of the file that contains the friend tree.
void AddFriend(const std::string &treeName, const std::string &fileName)
{
   // TODO: user might specify alias, now it is the tree name by default
   fFriendInfo.fFriendNames.emplace_back(treeName, treeName);

   // Exactly one file for this friend.
   fFriendInfo.fFriendFileNames.push_back({fileName});

   // An empty list of sub-names marks this friend as a plain tree (not a chain).
   fFriendInfo.fFriendChainSubNames.emplace_back();
}

/// Register a friend chain made of the same tree name in several files.
/// \param[in] treeName Name of the tree in every file; it is also used as the friend's alias.
/// \param[in] fileNames Names of the files that make up the friend chain.
void AddFriend(const std::string &treeName, const std::vector<std::string> &fileNames)
{
   // TODO: user might specify alias, now it is the tree name by default
   fFriendInfo.fFriendNames.emplace_back(treeName, treeName);
   fFriendInfo.fFriendFileNames.emplace_back(fileNames);
   // This is a chain: store one sub-name per file. The RLoopManager constructor pairs
   // fFriendFileNames[i][j] with fFriendChainSubNames[i][j], so a single-element list
   // would be indexed out of bounds as soon as more than one file is given.
   fFriendInfo.fFriendChainSubNames.emplace_back(std::vector<std::string>(fileNames.size(), treeName));
}

/// Register a friend chain given a list of tree names and a list of files.
/// \param[in] treeNames Either one tree name shared by all files, or one tree name per file.
///                      The first tree name is used as the friend's name and alias.
/// \param[in] fileNames Names of the files that make up the friend chain.
/// \throws std::logic_error if no tree name is given, or if the number of tree names is
///         neither 1 nor equal to the number of files.
void AddFriend(const std::vector<std::string> &treeNames, const std::vector<std::string> &fileNames)
{
   // treeNames[0] is read below, so an empty list must be rejected up front
   // (0 trees and 0 files would otherwise slip through the size check).
   if (treeNames.empty())
      throw std::logic_error("RDatasetSpec's friend expects at least one tree name.");
   if (fileNames.size() != treeNames.size() && treeNames.size() != 1)
      throw std::logic_error("RDatasetSpec's friend exepcts either N trees and N files, or 1 tree and N files.");

   // TODO: user might specify alias, now it is the FIRST tree name by default
   fFriendInfo.fFriendNames.emplace_back(treeNames[0], treeNames[0]);
   fFriendInfo.fFriendFileNames.emplace_back(fileNames);

   // This is a chain: store one sub-name per file. The RLoopManager constructor pairs
   // fFriendFileNames[i][j] with fFriendChainSubNames[i][j], so in the "1 tree, N files"
   // case the single tree name is repeated for every file.
   if (treeNames.size() == 1)
      fFriendInfo.fFriendChainSubNames.emplace_back(std::vector<std::string>(fileNames.size(), treeNames[0]));
   else
      fFriendInfo.fFriendChainSubNames.emplace_back(treeNames);
}
};

} // namespace RDF
Expand Down
2 changes: 2 additions & 0 deletions tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ class RLoopManager : public RNodeBase {
std::vector<ROOT::RDF::SampleCallback_t> fSampleCallbacks;
RDFInternal::RNewSampleNotifier fNewSampleNotifier;
std::vector<ROOT::RDF::RSampleInfo> fSampleInfos;
ROOT::Internal::TreeUtils::RFriendInfo fFriendInfo{};
std::vector<std::unique_ptr<TTree>> fFriends; ///< Friends of the tree/chain, if present
unsigned int fNRuns{0}; ///< Number of event loops run

/// Registry of per-slot value pointers for booked data-source columns
Expand Down
56 changes: 51 additions & 5 deletions tree/dataframe/src/RLoopManager.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -362,14 +362,60 @@ RLoopManager::RLoopManager(std::unique_ptr<RDataSource> ds, const ColumnNames_t
/// Build a loop manager from a dataset specification.
/// The main chain is assembled from spec.fTreeNames / spec.fFileNameGlobs, then one friend
/// chain is built for every entry of the friend information copied from the spec.
/// \param[in] spec The dataset specification (tree names, file names, entry range, friends).
RLoopManager::RLoopManager(const ROOT::RDF::RDatasetSpec &spec)
   : fStartEntry(spec.fStartEntry), fEndEntry(spec.fEndEntry), fNSlots(RDFInternal::GetNSlots()),
     fLoopType(ROOT::IsImplicitMTEnabled() ? ELoopType::kROOTFilesMT : ELoopType::kROOTFiles),
     fNewSampleNotifier(fNSlots), fSampleInfos(fNSlots), fFriendInfo(spec.fFriendInfo)
{
   // A TChain has a global name
   auto chain = std::make_shared<TChain>(spec.fTreeNames[0].c_str()); // use the first tree name (FOR NOW)

   if (spec.fTreeNames.size() == 1) {
      // The global name of the chain is also the name of each tree in the list
      // of files that make the chain.
      for (const auto &f : spec.fFileNameGlobs)
         chain->Add(f.c_str());
   } else {
      // Some other times, each different file has its own tree name, we need to
      // reconstruct the full path to the tree in each file and pass that to
      // TChain::Add
      for (auto i = 0u; i < spec.fFileNameGlobs.size(); i++) {
         const auto fullpath = spec.fFileNameGlobs[i] + "?#" + spec.fTreeNames[i];
         chain->Add(fullpath.c_str());
      }
   }
   SetTree(chain);

   const auto &friendNames = fFriendInfo.fFriendNames;
   const auto &friendFileNames = fFriendInfo.fFriendFileNames;
   const auto &friendChainSubNames = fFriendInfo.fFriendChainSubNames;
   const auto nFriends = friendNames.size();

   for (auto i = 0u; i < nFriends; ++i) {
      const auto &thisFriendName = friendNames[i].first;
      const auto &thisFriendAlias = friendNames[i].second;
      const auto &thisFriendFiles = friendFileNames[i];
      const auto &thisFriendChainSubNames = friendChainSubNames[i];

      // Build a friend chain
      auto frChain = std::make_unique<TChain>(thisFriendName.c_str());
      if (thisFriendChainSubNames.empty()) {
         // If there are no chain subnames, the friend was a TTree. It's safe
         // to add to the chain the filename directly.
         for (const auto &fileName : thisFriendFiles)
            frChain->Add(fileName.c_str());
      } else {
         // Otherwise, the new friend chain needs to be built using the nomenclature
         // "filename?#treename" as argument to `TChain::Add`.
         // A single sub-name is shared by all files; otherwise sub-names are matched to
         // files by position, with bounds-checked access so that a mismatch between the
         // two lists raises std::out_of_range instead of reading past the end.
         const bool sharedSubName = thisFriendChainSubNames.size() == 1;
         const auto nFileNames = thisFriendFiles.size();
         for (auto j = 0u; j < nFileNames; ++j) {
            const auto &subName = sharedSubName ? thisFriendChainSubNames.front() : thisFriendChainSubNames.at(j);
            frChain->Add((thisFriendFiles[j] + "?#" + subName).c_str());
         }
      }

      // Make it friends with the main chain. Only a raw pointer is handed over here;
      // the friend chain itself is kept in fFriends.
      fTree->AddFriend(frChain.get(), thisFriendAlias.c_str());
      fFriends.emplace_back(std::move(frChain));
   }
}

struct RSlotRAII {
Expand Down
23 changes: 21 additions & 2 deletions tree/dataframe/test/dataframe_datasetspec.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ TEST(RDFDatasetSpec, SingleFileSingleColConstructor)
EXPECT_THROW(
try {
RDatasetSpec({"tree"s, "anothertree"s}, {"file.root"s}, {2, 4});
} catch (const std::runtime_error &err) {
} catch (const std::logic_error &err) {
EXPECT_EQ(std::string(err.what()), "RDatasetSpec exepcts either N trees and N files, or 1 tree and N files.");
throw;
},
std::runtime_error);
std::logic_error);

// specify range [2, 2) (3 is a valid index) => range is disregarded
const auto dfRDS7 = RDataFrame(RDatasetSpec("tree", "file.root", {2, 2})).Display<int>({"x"})->AsString();
Expand Down Expand Up @@ -281,3 +281,22 @@ TEST(RDFDatasetSpec, MultipleFiles)

gSystem->Exec("rm file0.root file1.root file2.root");
}

// TODO: test the friends
/*
TEST(RDFDatasetSpec, FriendTrees)
{
auto dfWriter0 = RDataFrame(3)
.Define("x", [](ULong64_t e) { return int(e); }, {"rdfentry_"})
.Define("y", [](ULong64_t e) { return int(e) + 1; }, {"rdfentry_"});
dfWriter0.Snapshot<int, int>("treeA", "file0.root", {"x", "y"});
dfWriter0.Snapshot<int, int>("treeA", "file1.root", {"x", "y"});
dfWriter0.Snapshot<int, int>("treeB", "file2.root", {"x", "y"}); // different tree's name
auto dfWriter1 = RDataFrame(2)
.Define("x", [](ULong64_t e) { return int(e) + 2; }, {"rdfentry_"})
.Define("y", [](ULong64_t e) { return int(e) + 3; }, {"rdfentry_"});
dfWriter1.Snapshot<int, int>("treeA", "file3.root", {"x", "y"});
dfWriter1.Snapshot<int, int>("treeB", "file4.root", {"x", "y"});
dfWriter1.Snapshot<int, int>("treeC", "file5.root", {"x", "y"});
}
*/

0 comments on commit f67e9f7

Please sign in to comment.