Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DF][RDatasetSpec] Minimal set of initial functionality #3

Closed
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 72 additions & 20 deletions tree/dataframe/inc/ROOT/RDF/RDatasetSpec.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -13,44 +13,96 @@

#include <string>
#include <vector>
#include <limits>
#include <stdexcept> // std::logic_error

#include <RtypesCore.h>
#include <ROOT/InternalTreeUtils.hxx>

namespace ROOT {

namespace RDF {

struct RDatasetSpec {
std::string fDatasetName{}; ///< The name of the dataset to process.
/**
* A list of file names.
* They can contain the globbing characters supported by TChain. See TChain::Add for more information.
*/
std::vector<std::string> fFileNameGlobs{};
ULong64_t fStartEntry{}; ///< The entry where the dataset processing should start (inclusive).
ULong64_t fEndEntry{}; ///< The entry where the dataset processing should end (exclusive).
std::vector<std::string> fDefaultColumns{}; ///< A list of column names to process in the dataset.

/// Pair of entry numbers delimiting the part of the dataset to process.
/// The RDatasetSpec constructors copy both values into RDatasetSpec::fStartEntry and
/// RDatasetSpec::fEndEntry.
struct REntryRange {
   ULong64_t fStartEntry{0}; ///< First entry of the range; defaults to the first entry.
   ULong64_t fEndEntry{std::numeric_limits<ULong64_t>::max()}; ///< End of the range; defaults to the largest value.

   /// Default range: from entry 0 up to the largest representable entry number.
   REntryRange() {}

   /// Range starting at entry 0 and ending at `endEntry`.
   /// Intentionally not `explicit`, so that a plain entry number converts to a range.
   REntryRange(ULong64_t endEntry) : fEndEntry(endEntry) {}

   /// Range from `startEntry` to `endEntry`.
   /// \throws std::logic_error if `startEntry` is larger than `endEntry`.
   REntryRange(ULong64_t startEntry, ULong64_t endEntry) : fStartEntry(startEntry), fEndEntry(endEntry)
   {
      if (endEntry < startEntry)
         throw std::logic_error("RDatasetSpec: fStartEntry cannot be larger than fEndEntry.");
   }
};

/**
* A list of names of trees.
* This list should go in lockstep with fFileNameGlobs, only in case this dataset is a TChain where each file
* contains its own tree with a different name from the global name of the dataset.
*/
std::vector<std::string> fSubTreeNames{};
std::vector<std::string> fTreeNames{};

/**
* A list of file names.
* They can contain the globbing characters supported by TChain. See TChain::Add for more information.
*/
std::vector<std::string> fFileNameGlobs{};
ikabadzhov marked this conversation as resolved.
Show resolved Hide resolved

ULong64_t fStartEntry{}; ///< The entry where the dataset processing should start (inclusive).
ULong64_t fEndEntry{}; ///< The entry where the dataset processing should end (exclusive).

ROOT::Internal::TreeUtils::RFriendInfo fFriendInfo{}; ///< List of friends

/// Specification for a dataset made of one tree name and one file name (or glob).
/// \param treeName Name of the tree; stored as the only element of fTreeNames.
/// \param fileName File name or glob; stored as the only element of fFileNameGlobs.
/// \param entryRange Entry range to process; its bounds are copied into fStartEntry and fEndEntry.
RDatasetSpec(const std::string &treeName, const std::string &fileName, REntryRange entryRange = {})
   : fTreeNames{treeName},
     fFileNameGlobs{fileName},
     fStartEntry{entryRange.fStartEntry},
     fEndEntry{entryRange.fEndEntry}
{
}

/// Specification for a dataset where the same tree name is used with a list of files.
/// \param treeName Name of the tree; stored as the only element of fTreeNames.
/// \param fileNames File names or globs; copied into fFileNameGlobs.
/// \param entryRange Entry range to process; its bounds are copied into fStartEntry and fEndEntry.
RDatasetSpec(const std::string &treeName, const std::vector<std::string> &fileNames, REntryRange entryRange = {})
   : fTreeNames{treeName},
     fFileNameGlobs{fileNames},
     fStartEntry{entryRange.fStartEntry},
     fEndEntry{entryRange.fEndEntry}
{
}

/// Specification for a dataset given as a list of tree names and a list of files.
/// Accepted shapes are N tree names with N files (tree i lives in file i), or a single
/// tree name shared by all N files.
/// \param treeNames Tree names; must not be empty, because the first name is used as the name of the chain.
/// \param fileNames File names or globs; copied into fFileNameGlobs.
/// \param entryRange Entry range to process; its bounds are copied into fStartEntry and fEndEntry.
/// \throws std::logic_error if `treeNames` is empty, or if its size is neither 1 nor equal to the size of `fileNames`.
RDatasetSpec(const std::vector<std::string> &treeNames, const std::vector<std::string> &fileNames,
             REntryRange entryRange = {})
   : fTreeNames(
        // An empty list would otherwise slip through the size comparison when fileNames is empty too.
        treeNames.empty() || (fileNames.size() != treeNames.size() && treeNames.size() != 1)
           ? throw std::logic_error("RDatasetSpec expects either N trees and N files, or 1 tree and N files.")
           : treeNames),
     fFileNameGlobs(fileNames), fStartEntry(entryRange.fStartEntry), fEndEntry(entryRange.fEndEntry)
{
}

/// Register a friend that is a single tree stored in a single file.
/// \param treeName Name of the friend tree.
/// \param fileName File containing the friend tree.
void AddFriend(const std::string &treeName, const std::string &fileName)
{
   // TODO: user might specify alias, now it is the tree name by default
   const std::string &alias = treeName;
   fFriendInfo.fFriendNames.emplace_back(treeName, alias);

   // Exactly one file for this friend.
   fFriendInfo.fFriendFileNames.emplace_back(std::vector<std::string>(1, fileName));

   // An empty list of sub-names marks this friend as a plain tree rather than a chain.
   fFriendInfo.fFriendChainSubNames.emplace_back(std::vector<std::string>());
}

RDatasetSpec(const std::string &datasetName, const std::string &fileName, ULong64_t startEntry = 0,
ULong64_t endEntry = 0, const std::vector<std::string> &defaultColumns = {},
const std::vector<std::string> &subTreenames = {})
: fDatasetName(datasetName), fFileNameGlobs(std::vector<std::string>{fileName}), fStartEntry(startEntry),
fEndEntry(endEntry), fDefaultColumns(defaultColumns), fSubTreeNames(subTreenames)
/// Register a friend chain in which every file contains a tree with the same name.
/// \param treeName Name of the friend tree in each of the files.
/// \param fileNames Files making up the friend chain.
void AddFriend(const std::string &treeName, const std::vector<std::string> &fileNames)
{
   // TODO: user might specify alias, now it is the tree name by default
   fFriendInfo.fFriendNames.emplace_back(treeName, treeName);
   fFriendInfo.fFriendFileNames.emplace_back(fileNames);
   // The sub-names are read with the same index as the file names when the friend chain is
   // built, so store one sub-name per file instead of a single one.
   fFriendInfo.fFriendChainSubNames.emplace_back(std::vector<std::string>(fileNames.size(), treeName));
}

RDatasetSpec(const std::string &datasetName, const std::vector<std::string> &fileNames, ULong64_t startEntry = 0,
ULong64_t endEntry = 0, const std::vector<std::string> &defaultColumns = {},
const std::vector<std::string> &subTreenames = {})
: fDatasetName(datasetName), fFileNameGlobs(fileNames), fStartEntry(startEntry), fEndEntry(endEntry),
fDefaultColumns(defaultColumns), fSubTreeNames(subTreenames)
/// Register a friend chain given as a list of tree names and a list of files.
/// Accepted shapes are N tree names with N files (tree i lives in file i), or a single
/// tree name shared by all N files.
/// \param treeNames Names of the friend trees; must not be empty.
/// \param fileNames Files making up the friend chain.
/// \throws std::logic_error if `treeNames` is empty, or if its size is neither 1 nor equal to the size of `fileNames`.
void AddFriend(const std::vector<std::string> &treeNames, const std::vector<std::string> &fileNames)
{
   // The emptiness check is needed on its own: two empty lists have equal sizes, and
   // treeNames[0] is read below.
   if (treeNames.empty() || (fileNames.size() != treeNames.size() && treeNames.size() != 1))
      throw std::logic_error("RDatasetSpec's friend expects either N trees and N files, or 1 tree and N files.");

   // TODO: user might specify alias, now it is the FIRST tree name by default
   fFriendInfo.fFriendNames.emplace_back(treeNames[0], treeNames[0]);
   fFriendInfo.fFriendFileNames.emplace_back(fileNames);

   // The sub-names are read with the same index as the file names when the friend chain is
   // built: a single tree name is therefore repeated once per file.
   fFriendInfo.fFriendChainSubNames.emplace_back(treeNames.size() == fileNames.size()
                                                    ? treeNames
                                                    : std::vector<std::string>(fileNames.size(), treeNames[0]));
}
};

Expand Down
4 changes: 3 additions & 1 deletion tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ class RLoopManager : public RNodeBase {
std::shared_ptr<TTree> fTree{nullptr};
const ColumnNames_t fDefaultColumns;
ULong64_t fStartEntry{0};
ULong64_t fEndEntry{0};
ULong64_t fEndEntry{std::numeric_limits<ULong64_t>::max()};
ikabadzhov marked this conversation as resolved.
Show resolved Hide resolved
const ULong64_t fNEmptyEntries{0};
const unsigned int fNSlots{1};
bool fMustRunNamedFilters{true};
Expand All @@ -131,6 +131,8 @@ class RLoopManager : public RNodeBase {
std::vector<ROOT::RDF::SampleCallback_t> fSampleCallbacks;
RDFInternal::RNewSampleNotifier fNewSampleNotifier;
std::vector<ROOT::RDF::RSampleInfo> fSampleInfos;
ROOT::Internal::TreeUtils::RFriendInfo fFriendInfo{};
std::vector<std::unique_ptr<TTree>> fFriends; ///< Friends of the tree/chain, if present
ikabadzhov marked this conversation as resolved.
Show resolved Hide resolved
unsigned int fNRuns{0}; ///< Number of event loops run

/// Registry of per-slot value pointers for booked data-source columns
Expand Down
57 changes: 45 additions & 12 deletions tree/dataframe/src/RLoopManager.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -360,15 +360,14 @@ RLoopManager::RLoopManager(std::unique_ptr<RDataSource> ds, const ColumnNames_t
}

RLoopManager::RLoopManager(const ROOT::RDF::RDatasetSpec &spec)
: fDefaultColumns(spec.fDefaultColumns), fStartEntry(spec.fStartEntry), fEndEntry(spec.fEndEntry),
fNSlots(RDFInternal::GetNSlots()),
: fStartEntry(spec.fStartEntry), fEndEntry(spec.fEndEntry), fNSlots(RDFInternal::GetNSlots()),
fLoopType(ROOT::IsImplicitMTEnabled() ? ELoopType::kROOTFilesMT : ELoopType::kROOTFiles),
fNewSampleNotifier(fNSlots), fSampleInfos(fNSlots)
fNewSampleNotifier(fNSlots), fSampleInfos(fNSlots), fFriendInfo(spec.fFriendInfo)
{
// A TChain has a global name
auto chain = std::make_shared<TChain>(spec.fDatasetName.c_str());
auto chain = std::make_shared<TChain>(spec.fTreeNames[0].c_str()); // use the first tree name (FOR NOW)
ikabadzhov marked this conversation as resolved.
Show resolved Hide resolved

if (spec.fSubTreeNames.empty()){
if (spec.fTreeNames.size() == 1) {
// The global name of the chain is also the name of each tree in the list
// of files that make the chain.
for (const auto &f : spec.fFileNameGlobs)
Expand All @@ -377,13 +376,46 @@ RLoopManager::RLoopManager(const ROOT::RDF::RDatasetSpec &spec)
// Some other times, each different file has its own tree name, we need to
// reconstruct the full path to the tree in each file and pass that to
// TChain::Add
auto nfiles = spec.fFileNameGlobs.size();
for (decltype(nfiles) i = 0; i < nfiles; i++){
const auto fullpath = spec.fFileNameGlobs[i] + "?#" + spec.fSubTreeNames[i];
for (auto i = 0u; i < spec.fFileNameGlobs.size(); i++) {
const auto fullpath = spec.fFileNameGlobs[i] + "?#" + spec.fTreeNames[i];
chain->Add(fullpath.c_str());
}
}
SetTree(chain);

const auto &friendNames = fFriendInfo.fFriendNames;
ikabadzhov marked this conversation as resolved.
Show resolved Hide resolved
const auto &friendFileNames = fFriendInfo.fFriendFileNames;
const auto &friendChainSubNames = fFriendInfo.fFriendChainSubNames;
const auto nFriends = friendNames.size();

for (auto i = 0u; i < nFriends; ++i) {
const auto &thisFriendNameAlias = friendNames[i];
const auto &thisFriendName = thisFriendNameAlias.first;
const auto &thisFriendAlias = thisFriendNameAlias.second;
const auto &thisFriendFiles = friendFileNames[i];
const auto &thisFriendChainSubNames = friendChainSubNames[i];

// Build a friend chain
auto frChain = std::make_unique<TChain>(thisFriendName.c_str());
const auto nFileNames = friendFileNames[i].size();
if (thisFriendChainSubNames.empty()) {
// If there are no chain subnames, the friend was a TTree. It's safe
// to add to the chain the filename directly.
for (auto j = 0u; j < nFileNames; ++j) {
frChain->Add(thisFriendFiles[j].c_str());
}
} else {
// Otherwise, the new friend chain needs to be built using the nomenclature
// "filename/treename" as argument to `TChain::Add`
for (auto j = 0u; j < nFileNames; ++j) {
frChain->Add((thisFriendFiles[j] + "?#" + thisFriendChainSubNames[j]).c_str());
}
}

// Make it friends with the main chain
fTree->AddFriend(frChain.get(), thisFriendAlias.c_str());
fFriends.emplace_back(std::move(frChain));
}
}

struct RSlotRAII {
Expand Down Expand Up @@ -501,12 +533,13 @@ void RLoopManager::RunTreeProcessorMT()
/// Run event loop over one or multiple ROOT files, in sequence.
void RLoopManager::RunTreeReader()
{
if (fEndEntry == fStartEntry) // empty range => no work needed
ikabadzhov marked this conversation as resolved.
Show resolved Hide resolved
return;

TTreeReader r(fTree.get(), fTree->GetEntryList());

if (fEndEntry > fStartEntry) {
// User provided a valid entry range for the tree
r.SetEntriesRange(fStartEntry, fEndEntry);
}
if (r.SetEntriesRange(fStartEntry, fEndEntry)) // returning > 0 indicates invalid fStartEntry
throw std::runtime_error("RLoopManager: fStartEntry cannot be larger than the number of entries.");
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure this is the only case of invalid start entry for TTreeReader?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I know, yes: if fStartEntry > fEndEntry, the constructor would already have thrown. Moreover, if fStartEntry < "actual last entry" < fEndEntry, then SetEntriesRange returns 0 and the reader processes the entries from fStartEntry up to the actual last entry.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doublechecking https://github.com/root-project/root/blob/a402af862ace9573193500bcccaa8b4d56cc6632/tree/treeplayer/src/TTreeReader.cxx#L411 this looks like the only invalid thing that can go wrong at this stage.


if (0 == fTree->GetEntriesFast())
return;
Expand Down
Loading