
Commit

* fix bug

* opti seqpool-cvm infershape

* opti fused-seqpool-cvm

Co-authored-by: liaoxiaochao <[email protected]>
SmallBirdLiao and liaoxiaochao-bb authored Aug 29, 2022
1 parent 22839bb commit 6cfd041
Showing 13 changed files with 254 additions and 75 deletions.
4 changes: 4 additions & 0 deletions paddle/fluid/framework/new_executor/new_executor_defs.cc
@@ -322,6 +322,10 @@ bool InterpretercoreInferShapeContext::IsRunMKLDNNKernel() const {
}
}

Scope* InterpretercoreInferShapeContext::GetScopePtr() const {
return nullptr;
}

// TODO(paddle-dev): Can this be template?
std::vector<InferShapeVarPtr> InterpretercoreInferShapeContext::GetInputVarPtrs(
const std::string& name) const {
2 changes: 2 additions & 0 deletions paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -88,6 +88,8 @@ class InterpretercoreInferShapeContext : public InferShapeContext {

bool IsRunMKLDNNKernel() const override;

Scope* GetScopePtr() const override;

// TODO(paddle-dev): Can this be template?
std::vector<InferShapeVarPtr> GetInputVarPtrs(
const std::string& name) const override;
4 changes: 4 additions & 0 deletions paddle/fluid/framework/op_desc.cc
@@ -244,6 +244,8 @@ class CompileTimeInferShapeContext : public InferShapeContext {

bool IsRunMKLDNNKernel() const override;

Scope* GetScopePtr() const override;

std::vector<proto::VarType::Type> GetInputsVarType(
const std::string &name) const override {
return GetVarTypes(Inputs(name));
@@ -947,6 +949,8 @@ bool CompileTimeInferShapeContext::IsRuntime() const { return false; }

bool CompileTimeInferShapeContext::IsRunMKLDNNKernel() const { return false; }

Scope* CompileTimeInferShapeContext::GetScopePtr() const { return nullptr; }

proto::VarType::Type CompileTimeInferShapeContext::GetVarType(
const std::string &name) const {
return block_.FindVarRecursive(name)->GetType();
10 changes: 7 additions & 3 deletions paddle/fluid/framework/operator.cc
@@ -949,6 +949,10 @@ bool RuntimeInferShapeContext::IsRunMKLDNNKernel() const {
}
}

Scope* RuntimeInferShapeContext::GetScopePtr() const {
return const_cast<Scope*>(scope_);
}

// TODO(paddle-dev): Can this be template?
std::vector<InferShapeVarPtr> RuntimeInferShapeContext::GetInputVarPtrs(
const std::string& name) const {
@@ -1224,13 +1228,13 @@ void OperatorWithKernel::InferShape(InferShapeContext* ctx) const {
void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
const platform::Place& place,
const RuntimeContext& ctx) const {
RuntimeInferShapeContext infer_shape_ctx(*this, ctx);
RuntimeInferShapeContext infer_shape_ctx(*this, ctx, scope);
this->Info().infer_shape_(&infer_shape_ctx);
}

void OperatorWithKernel::RuntimeInferShape(const Scope& scope) const {
RuntimeContext ctx(Inputs(), Outputs(), scope);
RuntimeInferShapeContext infer_shape_ctx(*this, ctx);
RuntimeInferShapeContext infer_shape_ctx(*this, ctx, scope);
this->Info().infer_shape_(&infer_shape_ctx);
}

@@ -1442,7 +1446,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
platform::RecordEvent record_event("infer_shape",
platform::TracerEventType::OperatorInner,
1, platform::EventRole::kInnerOp);
RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx, exec_scope);
this->Info().infer_shape_(&infer_shape_ctx);
}

8 changes: 5 additions & 3 deletions paddle/fluid/framework/operator.h
@@ -137,7 +137,6 @@ class RuntimeContext {
RuntimeContext(const VariableValueMap& invars,
const VariableValueMap& outvars)
: inputs(invars), outputs(outvars) {}

VariableValueMap inputs;
VariableValueMap outputs;
};
@@ -715,8 +714,8 @@ class OperatorWithKernel : public OperatorBase {

class RuntimeInferShapeContext : public InferShapeContext {
public:
RuntimeInferShapeContext(const OperatorBase& op, const RuntimeContext& ctx)
: op_(op), ctx_(ctx) {}
RuntimeInferShapeContext(const OperatorBase& op, const RuntimeContext& ctx, const Scope& scope)
: op_(op), ctx_(ctx), scope_(&scope) {}

bool HasInput(const std::string &name) const override;
bool HasOutput(const std::string &name) const override;
@@ -760,6 +759,8 @@ class RuntimeInferShapeContext : public InferShapeContext {

bool IsRunMKLDNNKernel() const override;

Scope* GetScopePtr() const override;

std::vector<InferShapeVarPtr> GetInputVarPtrs(
const std::string &name) const override;
std::vector<InferShapeVarPtr> GetOutputVarPtrs(
@@ -789,6 +790,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
const std::vector<Variable*>& OutputVars(const std::string& name) const;
const OperatorBase& op_;
const RuntimeContext& ctx_;
const Scope* scope_;
};

extern bool OpSupportGPU(const std::string& op_type);
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ps_gpu_worker.cc
@@ -261,7 +261,7 @@ int PSGPUWorker::OpRunAndShapeCheck(OperatorBase& op,
auto& after_dims = check_data.after_dims;
auto& after_lods = check_data.after_lods;
RuntimeContext ctx(op.Inputs(), op.Outputs(), scope);
RuntimeInferShapeContext infer_shape_ctx(op, ctx);
RuntimeInferShapeContext infer_shape_ctx(op, ctx, scope);
auto outnames = op.Outputs();
for (auto& var_name_item : outnames) {
pre_dims.push_back(infer_shape_ctx.GetOutputsDim(var_name_item.first));
19 changes: 19 additions & 0 deletions paddle/fluid/framework/scope.cc
@@ -95,6 +95,25 @@ Variable* Scope::FindVar(const std::string& name) const {
return FindVarInternal(name);
}

std::vector<Variable*> Scope::FindVarFromChild(const std::string& name) const {
std::vector<Variable*> ret;
{
SCOPE_VARS_READER_LOCK
auto it = vars_.find(name);
if (it != vars_.end()) {
ret.push_back(it->second.get());
}
}
{
SCOPE_KIDS_READER_LOCK
for (Scope* s : kids_) {
auto child_ret = s->FindVarFromChild(name);
ret.insert(ret.end(), child_ret.begin(), child_ret.end());
}
}
return ret;
}

Variable* Scope::GetVar(const std::string& name) const {
auto* var = FindVar(name);
PADDLE_ENFORCE_NOT_NULL(
2 changes: 2 additions & 0 deletions paddle/fluid/framework/scope.h
@@ -107,6 +107,8 @@ class Scope : public ScopeBase {
/// Caller doesn't own the returned Variable.
Variable* FindVar(const std::string& name) const;

std::vector<Variable*> FindVarFromChild(const std::string& name) const;

/// Get a variable in the scope or any of its ancestors. Enforce
/// the returned Variable is not nullptr
Variable* GetVar(const std::string& name) const;
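A minimal usage sketch of the new lookup (the variable names below are illustrative, not part of the commit): unlike FindVar, which only walks toward ancestor scopes, FindVarFromChild also descends into every child scope and returns all matches.

// Hypothetical example: a variable created in a child scope is invisible to
// FindVar on the root scope but is returned by FindVarFromChild.
paddle::framework::Scope root;
auto& child = root.NewScope();
child.Var("cached_lod");  // e.g. a cache variable created during InferShape
std::vector<paddle::framework::Variable*> hits =
    root.FindVarFromChild("cached_lod");  // hits.size() == 1
// root.FindVar("cached_lod") would return nullptr here, since FindVar never
// searches child scopes.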
2 changes: 2 additions & 0 deletions paddle/fluid/framework/shape_inference.h
@@ -105,6 +105,8 @@ class InferShapeContext {

virtual bool IsRunMKLDNNKernel() const = 0;

virtual Scope* GetScopePtr() const = 0;

virtual std::vector<InferShapeVarPtr> GetInputVarPtrs(
const std::string &name) const = 0;
virtual std::vector<InferShapeVarPtr> GetOutputVarPtrs(
41 changes: 40 additions & 1 deletion paddle/fluid/framework/var_type_traits.h
@@ -21,6 +21,8 @@
#include <unordered_map>
#include <vector>

#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/string_array.h"
@@ -98,6 +100,42 @@ class OrderedMultiDeviceLoDTensorBlockingQueueHolder;
namespace paddle {
namespace framework {

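// Helper that keeps a buffer in CUDA pinned host memory together with an
// optional device copy, so CPU-prepared data (such as the merged LoDs built
// during InferShape) can be staged once and copied to the GPU asynchronously.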
class GpuPinnedVector {
public:
GpuPinnedVector() {}
void cpu_to_pinedcpu(void* buf, size_t len) {
mem_cpu_ = memory::Alloc(phi::GPUPinnedPlace(), len);
memcpy(reinterpret_cast<char*>(mem_cpu_->ptr()), buf, len);
len_ = len;
}
void pinedcpu_to_gpu(paddle::gpuStream_t stream, phi::Place place) {
mem_gpu_ = memory::Alloc(place, len_);
cudaMemcpyAsync(reinterpret_cast<char*>(mem_gpu_->ptr()), reinterpret_cast<char*>(mem_cpu_->ptr()),
len_, cudaMemcpyHostToDevice, stream);
}
void cpu_to_gpu(void* buf, size_t len, paddle::gpuStream_t stream, phi::Place place) {
mem_cpu_ = memory::Alloc(phi::GPUPinnedPlace(), len);
memcpy(reinterpret_cast<char*>(mem_cpu_->ptr()), buf, len);
mem_gpu_ = memory::Alloc(place, len);
cudaMemcpyAsync(reinterpret_cast<char*>(mem_gpu_->ptr()), reinterpret_cast<char*>(mem_cpu_->ptr()),
len, cudaMemcpyHostToDevice, stream);
len_ = len;
}
template <typename Type>
Type* get_gpu_ptr() {
return reinterpret_cast<Type*>(mem_gpu_->ptr());
}
template <typename Type>
Type* get_cpu_ptr() {
return reinterpret_cast<Type*>(mem_cpu_->ptr());
}
private:
memory::allocation::AllocationPtr mem_cpu_;
memory::allocation::AllocationPtr mem_gpu_;
size_t len_;
};


const char *ToTypeName(int var_id);
const std::type_index &VarTraitIdToTypeIndex(int var_id);
int TypeIndexToVarTraitId(const std::type_index &type);
@@ -189,7 +227,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#if defined(PADDLE_WITH_CNCL)
cnclCliqueId,
#endif
int, float, Vocab>;
int, float, Vocab,
GpuPinnedVector>;
template <typename T>
struct VarTypeTrait {
static_assert(VarTypeRegistry::IsRegistered<T>(), "Must be registered type");
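A rough usage sketch of the new GpuPinnedVector (illustrative only; the stream and place values are assumed to come from the surrounding device context):

// Stage the merged LoD in pinned host memory at InferShape time ...
std::vector<size_t> mix_lods = {0, 3, 5, 9};
paddle::framework::GpuPinnedVector cache;
cache.cpu_to_pinedcpu(mix_lods.data(), mix_lods.size() * sizeof(size_t));
// ... and copy it to the device later, where a CUDA stream is available:
// cache.pinedcpu_to_gpu(stream, place);               // async H2D copy
// const size_t* d_lods = cache.get_gpu_ptr<size_t>();  // pointer for the kernel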
4 changes: 4 additions & 0 deletions paddle/fluid/imperative/infer_shape_context.h
@@ -224,6 +224,10 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
return (op_kernel_type_ &&
(op_kernel_type_->data_layout_ == framework::DataLayout::kMKLDNN));
}

framework::Scope* GetScopePtr() const override {
return nullptr;
}

std::vector<framework::InferShapeVarPtr> GetInputVarPtrs(
const std::string& name) const override {
97 changes: 94 additions & 3 deletions paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc
@@ -62,18 +62,20 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel {
if (ctx->IsRuntime()) {
int batch_size = -1;
auto inputs_tensor = ctx->GetInputVarPtrs("X");
uint64_t tmp_var_key = 0;
for (size_t i = 0; i < num_inputs; ++i) {
const auto dims = ins_dims[i];
int rank = dims.size();
int cur_batch_size = 0;
framework::Variable* x_var =
BOOST_GET(framework::Variable*, inputs_tensor[i]);
const auto& x_tensor = x_var->Get<LoDTensor>();
const auto& x_lod = x_tensor.lod();
const auto x_tensor = x_var->GetMutable<LoDTensor>();
tmp_var_key += (uint64_t)(x_tensor);
const auto& x_lod = x_tensor->lod();
if (x_lod.size() > 0) {
cur_batch_size = x_lod[0].size() - 1;
} else {
cur_batch_size = x_tensor.dims()[0];
cur_batch_size = x_tensor->dims()[0];
}
if (batch_size == -1) {
batch_size = cur_batch_size;
@@ -93,6 +95,41 @@ }
}
outs_dims[i] = phi::make_ddim(out_dim);
}

// Prepare the GPU-side LoD data here; doing it in Compute would drag down performance.
{
auto scope = ctx->GetScopePtr();
auto& child_scope = scope->NewScope();
std::string var_name = "FusedSeqpoolCVMOp_";
var_name.append(std::to_string(tmp_var_key));
auto var = child_scope.Var(var_name);
paddle::framework::GpuPinnedVector* pin_ptr = var->GetMutable<paddle::framework::GpuPinnedVector>();

std::vector<size_t> mix_lods;
mix_lods.reserve(num_inputs * (batch_size + 1));
for (size_t i = 0; i < num_inputs; ++i) {
framework::Variable* x_var = BOOST_GET(framework::Variable*, inputs_tensor[i]);
const auto& x_tensor = x_var->Get<LoDTensor>();
const auto& x_lod = x_tensor.lod();
if (x_lod.size() != 0) {
PADDLE_ENFORCE_EQ(x_lod.size(), 1,
platform::errors::PreconditionNotMet(
"The lod size of all input should be 1, "
"please cheack"));
PADDLE_ENFORCE_EQ(x_lod[0].size(), batch_size + 1,
platform::errors::PreconditionNotMet(
"The lod[0] size of all input should be batch_size + 1, "
"please cheack"));
mix_lods.insert(mix_lods.end(), x_lod[0].begin(), x_lod[0].end());
} else {
mix_lods.push_back(0);
for (int i = 0; i < x_tensor.dims()[0]; i++) {
mix_lods.push_back(i + 1);
}
}
}
pin_ptr->cpu_to_pinedcpu(mix_lods.data(), mix_lods.size() * sizeof(size_t));
}
} else {
for (size_t i = 0; i < num_inputs; ++i) {
const auto dims = ins_dims[i];
@@ -222,6 +259,60 @@ class FusedSeqpoolCVMGradOp : public framework::OperatorWithKernel {
ctx->ShareLoD("X", framework::GradVarName("X"), i, i);
ctx->ShareDim("X", framework::GradVarName("X"), i, i);
}

// Prepare the GPU-side LoD data here; doing it in Compute would drag down performance.
if (ctx->IsRuntime()) {
auto inputs_tensor = ctx->GetOutputVarPtrs(framework::GradVarName("X"));
size_t num_inputs = inputs_tensor.size();
uint64_t tmp_var_key = 0;
framework::Variable* x_var = BOOST_GET(framework::Variable*, inputs_tensor[0]);
const LoDTensor* x_tensor = x_var->GetMutable<LoDTensor>();
int batch_size = x_tensor->lod().size() ? x_tensor->lod()[0].size() - 1 : x_tensor->dims()[0];

std::vector<size_t> mix_lods;
mix_lods.reserve(num_inputs * (batch_size + 1));
for (size_t i = 0; i < num_inputs; i++) {
x_var = BOOST_GET(framework::Variable*, inputs_tensor[i]);
x_tensor = x_var->GetMutable<LoDTensor>();
tmp_var_key += (uint64_t)(x_tensor);
const auto& x_lod = x_tensor->lod();
if (x_lod.size() != 0) {
PADDLE_ENFORCE_EQ(x_lod.size(), 1,
platform::errors::PreconditionNotMet(
"The lod size of all in_grad should be 1, "
"please cheack"));
PADDLE_ENFORCE_EQ(x_lod[0].size(), batch_size + 1,
platform::errors::PreconditionNotMet(
"The lod[0] size of all in_grad should be batch_size + 1, "
"please cheack"));
mix_lods.insert(mix_lods.end(), x_lod[0].begin(), x_lod[0].end());
} else {
mix_lods.push_back(0);
for (int i = 0; i < x_tensor->dims()[0]; i++) {
mix_lods.push_back(i + 1);
}
}
int cur_batch_size = x_tensor->lod().size() ? x_tensor->lod()[0].size() - 1 : x_tensor->dims()[0];
PADDLE_ENFORCE_EQ(batch_size, cur_batch_size,
platform::errors::PreconditionNotMet(
"The batch size of all in_grad should be same, "
"please cheack, last batchsize is %d, current "
"batchsize is %d",
batch_size, cur_batch_size));
}
PADDLE_ENFORCE_EQ(mix_lods.size(), num_inputs * (batch_size + 1),
platform::errors::PreconditionNotMet(
"The merged lod size should be num_inputs * (batch_size + 1), "
"please check"));

std::string var_name = "FusedSeqpoolCVMGradOp_";
var_name.append(std::to_string(tmp_var_key));
auto scope = ctx->GetScopePtr();
auto& child_scope = scope->NewScope();
auto var = child_scope.Var(var_name);
paddle::framework::GpuPinnedVector* pin_ptr = var->GetMutable<paddle::framework::GpuPinnedVector>();
pin_ptr->cpu_to_pinedcpu(mix_lods.data(), mix_lods.size() * sizeof(size_t));
}

}

protected:
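The corresponding fused_seqpool_cvm_op.cu change is among the files not shown in this excerpt; a hedged sketch of how the Compute side could pick up the cached LoD follows (the scope, input_tensors, stream, and place names are assumptions, not code from the commit):

// Hypothetical consumer: rebuild the key from the input tensor addresses and
// fetch the pinned LoD prepared during InferShape, then move it to the GPU.
uint64_t key = 0;
for (auto* t : input_tensors) key += reinterpret_cast<uint64_t>(t);
std::string var_name = "FusedSeqpoolCVMOp_" + std::to_string(key);
auto vars = scope.FindVarFromChild(var_name);
if (!vars.empty()) {
  auto* pin = vars[0]->GetMutable<paddle::framework::GpuPinnedVector>();
  pin->pinedcpu_to_gpu(stream, place);                // async copy to device
  const size_t* d_lods = pin->get_gpu_ptr<size_t>();  // passed to the CUDA kernel
}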
(The remaining changed file is not shown in this view.)

