Commit 7c96efe

[Inference] Add tuned trt_dynamic_shape mode. (#34806)
1 parent f5e430c

26 files changed: +929 / -67 lines
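
In outline, the commit adds a two-phase workflow: first run the model on representative inputs with shape collection enabled, so the min, max, and most frequent ("opt") shape of every tensor is recorded to a file; then create a TensorRT predictor whose dynamic-shape ranges are tuned from that file, optionally rebuilding the engine at runtime when an unseen shape arrives. A minimal usage sketch against the APIs added in this commit (model paths, the shape-range filename, and the EnableUseGpu/EnableTensorRtEngine arguments are placeholders):

#include "paddle_inference_api.h"  // header name depends on the install layout

int main() {
  // Phase 1: collect shape ranges while running representative inputs.
  paddle::AnalysisConfig c1;
  c1.SetModel("model/__model__", "model/__params__");  // placeholder paths
  c1.EnableUseGpu(100 /*MB pool*/, 0 /*device id*/);
  c1.CollectShapeRangeInfo("shape_range_info.pbtxt");  // placeholder file
  {
    auto collector = paddle::CreatePaddlePredictor(c1);
    // ... feed real inputs and run; the min/max/opt shapes are computed and
    // serialized to the file when the predictor is destroyed.
  }

  // Phase 2: build TRT engines whose dynamic-shape ranges come from the
  // recorded file instead of hand-written min/max/opt shape maps.
  paddle::AnalysisConfig c2;
  c2.SetModel("model/__model__", "model/__params__");
  c2.EnableUseGpu(100, 0);
  c2.EnableTensorRtEngine();
  c2.EnableTunedTensorRtDynamicShape("shape_range_info.pbtxt",
                                     /*allow_build_at_runtime=*/true);
  auto predictor = paddle::CreatePaddlePredictor(c2);
  return 0;
}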

paddle/fluid/inference/analysis/argument.h

Lines changed: 6 additions & 0 deletions
@@ -212,6 +212,12 @@ struct Argument {
                       bool);
   DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
   DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path,
+                      TensorRtShapeRangeInfoPath, std::string);
+  DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape,
+                      bool);
+  DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime,
+                      TensorRtAllowBuildAtRuntime, bool);
 
   DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
   DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);

paddle/fluid/inference/analysis/ir_pass_manager.cc

Lines changed: 13 additions & 5 deletions
@@ -146,24 +146,32 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
       pass->Set("use_static_engine", new bool(use_static_engine));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+
+      // tuned trt dynamic_shape
+      pass->Set("trt_shape_range_info_path",
+                new std::string(argument->tensorrt_shape_range_info_path()));
+      pass->Set("trt_tuned_dynamic_shape",
+                new bool(argument->tensorrt_tuned_dynamic_shape()));
+      pass->Set("trt_allow_build_at_runtime",
+                new bool(argument->tensorrt_allow_build_at_runtime()));
       pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
                                        argument->max_input_shape()));
       pass->Set("min_input_shape", new std::map<std::string, std::vector<int>>(
                                        argument->min_input_shape()));
       pass->Set("optim_input_shape",
                 new std::map<std::string, std::vector<int>>(
                     argument->optim_input_shape()));
-      bool with_dynamic_shape = argument->max_input_shape().size() > 0 &&
-                                argument->min_input_shape().size() > 0 &&
-                                argument->optim_input_shape().size() > 0;
+      bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
+                                 argument->min_input_shape().size() > 0 &&
+                                 argument->optim_input_shape().size() > 0) ||
+                                argument->tensorrt_tuned_dynamic_shape();
       pass->Set("with_dynamic_shape", new bool(with_dynamic_shape));
       pass->Set("trt_disabled_ops", new std::vector<std::string>(
                                         argument->tensorrt_disabled_ops()));
       pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla()));
       pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core()));
       // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will
-      // not
-      // run fp16.
+      // not run fp16.
       pass->Set("disable_trt_plugin_fp16",
                 new bool(argument->disable_trt_plugin_fp16()));
     } else if (pass_name == "dlnne_subgraph_pass") {

paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 cc_library(subgraph_util SRCS subgraph_util.cc DEPS subgraph_detector)
 
 if (WITH_GPU AND TENSORRT_FOUND)
-  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller)
+  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller infer_io_utils)
 
   set(analysis_deps ${analysis_deps}
         subgraph_util tensorrt_subgraph_pass

paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc

Lines changed: 29 additions & 11 deletions
@@ -22,6 +22,7 @@
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
 
 namespace paddle {
 namespace inference {
@@ -197,6 +198,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   auto opt_input_shape =
       Get<std::map<std::string, std::vector<int>>>("optim_input_shape");
 
+  auto allow_build_at_runtime = Get<bool>("trt_allow_build_at_runtime");
+  auto shape_range_info_path = Get<std::string>("trt_shape_range_info_path");
+  auto trt_tuned_dynamic_shape = Get<bool>("trt_tuned_dynamic_shape");
+  int max_batch_size = Get<int>("max_batch_size");
+  if (trt_tuned_dynamic_shape) {
+    VLOG(1) << "trt dynamic_shape deserialize from " << shape_range_info_path;
+    inference::DeserializeShapeRangeInfo(shape_range_info_path,
+                                         &min_input_shape, &max_input_shape,
+                                         &opt_input_shape);
+  }
+
   // The following procedure is used to rename all the intermediate
   // variables and the output variables of the subgraph.
   // Why we do this?
@@ -242,12 +254,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
 
   op_desc->SetBlockAttr("sub_block", new_block);
   op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
-  op_desc->SetAttr("max_batch_size", Get<int>("max_batch_size"));
+  op_desc->SetAttr("max_batch_size", max_batch_size);
   op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
   op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
   op_desc->SetAttr("output_name_mapping", output_mapping);
   op_desc->SetAttr("origin_output_dims", renamed_output_dims);
   op_desc->SetAttr("parameters", params);
+  op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
+  op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
 
   // we record all inputs' shapes in attr to check if they are consistent
   // with the real inputs' shapes retrieved from scope when trt runs.
@@ -259,19 +273,24 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   }
 
   auto use_static_engine = Get<bool>("use_static_engine");
+  op_desc->SetAttr("use_static_engine", use_static_engine);
+  if (use_static_engine)
+    op_desc->SetAttr("model_opt_cache_dir",
+                     Get<std::string>("model_opt_cache_dir"));
+
   // TODO(NHZlX)
   // There are models with the same structure but the different parameters,
   // when running in the 'use_serialize' mode, there is a bug.
   // serialization is affected by max_batch_size, but calibration is not.
   // So we use seperate engine keys in serialization and calibration.
   auto engine_key = GenerateEngineKey(
       input_names_with_id, output_names_with_id, std::to_string(0),
-      std::to_string(Get<int>("max_batch_size")),
+      std::to_string(max_batch_size),
       std::to_string(static_cast<int>(precision_mode)), false);
-  auto calibration_engine_key = GenerateEngineKey(
-      input_names_with_id, output_names_with_id, std::to_string(0),
-      std::to_string(Get<int>("max_batch_size")),
-      std::to_string(static_cast<int>(precision_mode)), true);
+  auto calibration_engine_key =
+      GenerateEngineKey(input_names_with_id, output_names_with_id,
+                        std::to_string(0), std::to_string(max_batch_size),
+                        std::to_string(static_cast<int>(precision_mode)), true);
   auto predictor_id = Get<int>("predictor_id");
 
   // Get "" when there is no cached calibration table data.
@@ -345,11 +364,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   bool disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
   tensorrt::TensorRTEngine *trt_engine =
       inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
-          .Create(engine_key + std::to_string(predictor_id),
-                  Get<int>("max_batch_size"), Get<int>("workspace_size"),
-                  precision_mode, calibrator.get(), Get<int>("gpu_device_id"),
-                  min_input_shape, max_input_shape, opt_input_shape,
-                  disable_trt_plugin_fp16);
+          .Create(engine_key + std::to_string(predictor_id), max_batch_size,
+                  Get<int>("workspace_size"), precision_mode, calibrator.get(),
+                  Get<int>("gpu_device_id"), min_input_shape, max_input_shape,
+                  opt_input_shape, disable_trt_plugin_fp16);
   trt_engine->SetUseOSS(Get<bool>("use_oss"));
   trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
   trt_engine->SetDLACore(Get<int>("trt_dla_core"));
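
The subgraph pass consumes the recorded ranges through the serialization helpers in io_utils (hence the new infer_io_utils dependency above). A self-contained round-trip sketch, with the signatures inferred from their call sites in this diff; the tensor name, shapes, and file name are made up:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

#include "paddle/fluid/inference/utils/io_utils.h"

int main() {
  // Ranges for a hypothetical input tensor "x".
  std::map<std::string, std::vector<int32_t>> min{{"x", {1, 3, 224, 224}}};
  std::map<std::string, std::vector<int32_t>> max{{"x", {8, 3, 224, 224}}};
  std::map<std::string, std::vector<int32_t>> opt{{"x", {4, 3, 224, 224}}};
  paddle::inference::SerializeShapeRangeInfo("shape_range_info.pbtxt", min,
                                             max, opt);

  // What CreateTensorRTOp does at engine-build time when
  // trt_tuned_dynamic_shape is set.
  std::map<std::string, std::vector<int>> min2, max2, opt2;
  paddle::inference::DeserializeShapeRangeInfo("shape_range_info.pbtxt", &min2,
                                               &max2, &opt2);
  return 0;
}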

paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc

Lines changed: 8 additions & 1 deletion
@@ -55,10 +55,17 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // We get all the vars from local_scope instead of the ProgramDesc.
   // Because there exists the case that new parameter variables are not added to
   // the program in the analysis pass.
+  bool reserve_cpu_weights = false;
+  if (argument->tensorrt_allow_build_at_runtime_valid() &&
+      argument->tensorrt_allow_build_at_runtime()) {
+    reserve_cpu_weights = true;
+  }
   for (auto &var_name : all_vars) {
     if (std::count(repetitive_params.begin(), repetitive_params.end(),
                    var_name)) {
-      scope->EraseVars({var_name});
+      if (!reserve_cpu_weights) {
+        scope->EraseVars({var_name});
+      }
       continue;
     }
     auto *var = scope->FindLocalVar(var_name);

paddle/fluid/inference/api/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
 endif()
 
 cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps}
-        zero_copy_tensor ir_pass_manager op_compatible_info)
+        zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils)
 
 cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)

paddle/fluid/inference/api/analysis_config.cc

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
158158
CP_MEMBER(trt_use_static_engine_);
159159
CP_MEMBER(trt_use_calib_mode_);
160160
CP_MEMBER(trt_use_oss_);
161+
CP_MEMBER(trt_tuned_dynamic_shape_);
162+
CP_MEMBER(trt_allow_build_at_runtime_);
163+
CP_MEMBER(collect_shape_range_info_);
164+
CP_MEMBER(shape_range_info_path_);
161165
// Dlnne related
162166
CP_MEMBER(use_dlnne_);
163167
CP_MEMBER(dlnne_min_subgraph_size_);
@@ -653,8 +657,8 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
653657
#endif
654658
}
655659

656-
void AnalysisConfig::EnableMemoryOptim() {
657-
enable_memory_optim_ = true;
660+
void AnalysisConfig::EnableMemoryOptim(bool x) {
661+
enable_memory_optim_ = x;
658662
Update();
659663
}
660664

@@ -783,6 +787,9 @@ std::string AnalysisConfig::Summary() {
783787
// dynamic_shape
784788
os.InsertRow({"tensorrt_enable_dynamic_shape",
785789
min_input_shape_.empty() ? "false" : "true"});
790+
os.InsertRow({"tensorrt_tuned_dynamic_shape", trt_tuned_dynamic_shape_
791+
? shape_range_info_path_
792+
: "false"});
786793

787794
os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"});
788795
os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"});
@@ -812,8 +819,46 @@ std::string AnalysisConfig::Summary() {
812819
os.InsertRow({"memory_optim", enable_memory_optim_ ? "true" : "false"});
813820
os.InsertRow({"enable_profile", with_profile_ ? "true" : "false"});
814821
os.InsertRow({"enable_log", with_glog_info_ ? "true" : "false"});
822+
os.InsertRow({"collect_shape_range_info",
823+
collect_shape_range_info_ ? shape_range_info_path_ : "false"});
815824

816825
return os.PrintTable();
817826
}
818827

828+
void AnalysisConfig::CollectShapeRangeInfo(
829+
const std::string &shape_range_info_path) {
830+
LOG(INFO) << "In CollectShapeInfo mode, we will disable optimizations and "
831+
"collect the shape information of "
832+
<< "all intermediate tensors in the compute graph and calculate "
833+
"the min_shape, max_shape and opt_shape.";
834+
collect_shape_range_info_ = true;
835+
PADDLE_ENFORCE_EQ(shape_range_info_path.empty(), false,
836+
platform::errors::InvalidArgument(
837+
"The shape_range_info_path should not be empty, please "
838+
"re-check the argument."));
839+
shape_range_info_path_ = shape_range_info_path;
840+
}
841+
842+
const std::string &AnalysisConfig::shape_range_info_path() {
843+
return shape_range_info_path_;
844+
}
845+
846+
bool AnalysisConfig::shape_range_info_collected() {
847+
return collect_shape_range_info_;
848+
}
849+
850+
void AnalysisConfig::EnableTunedTensorRtDynamicShape(
851+
const std::string &shape_range_info_path, bool allow_build_at_runtime) {
852+
shape_range_info_path_ = shape_range_info_path;
853+
trt_allow_build_at_runtime_ = allow_build_at_runtime;
854+
trt_tuned_dynamic_shape_ = true;
855+
}
856+
857+
bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
858+
return trt_tuned_dynamic_shape_;
859+
}
860+
861+
bool AnalysisConfig::trt_allow_build_at_runtime() {
862+
return trt_allow_build_at_runtime_;
863+
}
819864
} // namespace paddle
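
The new state is also introspectable: the getters above plus two extra Summary() rows make it easy to confirm what a config will do. A small sketch (the commented table layout is approximate):

#include <cassert>
#include <iostream>

#include "paddle_inference_api.h"  // header name depends on the install layout

int main() {
  paddle::AnalysisConfig config;
  config.EnableTunedTensorRtDynamicShape("shape_range_info.pbtxt",
                                         /*allow_build_at_runtime=*/true);
  assert(config.tuned_tensorrt_dynamic_shape());
  assert(config.trt_allow_build_at_runtime());
  std::cout << config.Summary();
  // The table now includes rows like:
  //   tensorrt_tuned_dynamic_shape   shape_range_info.pbtxt
  //   collect_shape_range_info       false
  return 0;
}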

paddle/fluid/inference/api/analysis_predictor.cc

Lines changed: 90 additions & 0 deletions
@@ -13,14 +13,17 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/analysis_predictor.h"
+
 #include <glog/logging.h>
+
 #include <algorithm>
 #include <fstream>
 #include <memory>
 #include <set>
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/extension/include/ext_op_meta_info.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
@@ -34,6 +37,7 @@
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
@@ -570,6 +574,11 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetMaxInputShape(config_.max_input_shape_);
     argument_.SetOptimInputShape(config_.optim_input_shape_);
     argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
+    argument_.SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path());
+    argument_.SetTensorRtTunedDynamicShape(
+        config_.tuned_tensorrt_dynamic_shape());
+    argument_.SetTensorRtAllowBuildAtRuntime(
+        config_.trt_allow_build_at_runtime());
   }
 
   if (config_.dlnne_enabled()) {
@@ -915,6 +924,11 @@ bool AnalysisPredictor::ZeroCopyRun() {
 #endif
 
   executor_->Run();
+
+  if (config_.shape_range_info_collected()) {
+    CollectShapeRangeInfo();
+  }
+
   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
   tensor_array_batch_cleaner_.ResetTensorArray();
@@ -934,6 +948,78 @@ bool AnalysisPredictor::ZeroCopyRun() {
   return true;
 }
 
+void AnalysisPredictor::CollectShapeRangeInfo() {
+  // if use gpu, sync first.
+  if (config_.use_gpu()) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    paddle::platform::DeviceContextPool &pool =
+        paddle::platform::DeviceContextPool::Instance();
+    auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_);
+    auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
+        pool.Get(gpu_place));
+#ifdef PADDLE_WITH_HIP
+    hipStreamSynchronize(dev_ctx->stream());
+#else
+    cudaStreamSynchronize(dev_ctx->stream());
+#endif
+#endif
+  }
+
+  std::vector<std::string> var_names = sub_scope_->LocalVarNames();
+  for (const auto &name : var_names) {
+    auto *var = sub_scope_->GetVar(name);
+    if (!var->IsType<framework::LoDTensor>()) {
+      continue;
+    }
+    framework::DDim dim = var->Get<framework::LoDTensor>().dims();
+    std::vector<int32_t> shape(dim.size());
+    for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
+    shape_info_[name].emplace_back(shape);
+  }
+}
+
+void AnalysisPredictor::StatisticShapeRangeInfo() {
+  std::map<std::string, std::vector<int32_t>> min_shapes;
+  std::map<std::string, std::vector<int32_t>> max_shapes;
+  std::map<std::string, std::vector<int32_t>> opt_shapes;
+  for (auto it : shape_info_) {
+    auto name = it.first;
+    auto shapes = it.second;
+
+    std::vector<int32_t> min_shape(shapes[0].begin(), shapes[0].end());
+    std::vector<int32_t> max_shape(shapes[0].begin(), shapes[0].end());
+    std::vector<int32_t> opt_shape(shapes[0].begin(), shapes[0].end());
+
+    auto ShapeMaxFreq = [](const std::map<int32_t, int32_t> &m) -> int32_t {
+      std::vector<std::pair<int32_t, int32_t>> counter;
+      for (auto &it : m) counter.push_back(it);
+      std::sort(
+          counter.begin(), counter.end(),
+          [](std::pair<int32_t, int32_t> &a, std::pair<int32_t, int32_t> &b) {
+            return a.second > b.second;
+          });
+      return counter[0].first;
+    };
+
+    for (size_t d = 0; d < shapes[0].size(); ++d) {
+      std::map<int32_t, int32_t> counter;
+      for (size_t i = 0; i < shapes.size(); ++i) {
+        counter[shapes[i][d]] += 1;
+        if (shapes[i][d] < min_shape[d]) min_shape[d] = shapes[i][d];
+        if (shapes[i][d] > max_shape[d]) max_shape[d] = shapes[i][d];
+      }
+      opt_shape[d] = ShapeMaxFreq(counter);
+    }
+
+    min_shapes[name] = min_shape;
+    max_shapes[name] = max_shape;
+    opt_shapes[name] = opt_shape;
+  }
+
+  inference::SerializeShapeRangeInfo(config_.shape_range_info_path(),
+                                     min_shapes, max_shapes, opt_shapes);
+}
+
 bool AnalysisPredictor::LoadProgramDesc() {
   // Initialize the inference program
   std::string filename;
@@ -1140,6 +1226,10 @@ AnalysisPredictor::~AnalysisPredictor() {
   }
 #endif
 
+  if (config_.shape_range_info_collected()) {
+    StatisticShapeRangeInfo();
+  }
+
   memory::Release(place_);
 }
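
StatisticShapeRangeInfo reduces each tensor's recorded history to three vectors: the element-wise minimum, the element-wise maximum, and the per-dimension mode (most frequent value). A standalone sketch of that reduction on made-up data, using std::max_element in place of the sort-based ShapeMaxFreq above:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main() {
  // Shapes recorded for one tensor across three runs (made-up data).
  std::vector<std::vector<int32_t>> shapes = {
      {1, 3, 224, 224}, {4, 3, 224, 224}, {4, 3, 320, 320}};

  std::vector<int32_t> min_shape(shapes[0]);
  std::vector<int32_t> max_shape(shapes[0]);
  std::vector<int32_t> opt_shape(shapes[0]);

  for (size_t d = 0; d < shapes[0].size(); ++d) {
    std::map<int32_t, int32_t> counter;  // value of dim d -> frequency
    for (const auto &s : shapes) {
      counter[s[d]] += 1;
      min_shape[d] = std::min(min_shape[d], s[d]);
      max_shape[d] = std::max(max_shape[d], s[d]);
    }
    // Most frequent value wins, mirroring ShapeMaxFreq.
    opt_shape[d] = std::max_element(counter.begin(), counter.end(),
                                    [](const auto &a, const auto &b) {
                                      return a.second < b.second;
                                    })
                       ->first;
  }

  auto print = [](const char *tag, const std::vector<int32_t> &v) {
    std::cout << tag << ":";
    for (int32_t x : v) std::cout << " " << x;
    std::cout << "\n";
  };
  print("min", min_shape);  // min: 1 3 224 224
  print("max", max_shape);  // max: 4 3 320 320
  print("opt", opt_shape);  // opt: 4 3 224 224
  return 0;
}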
