|
1 | 1 | # /// script |
2 | 2 | # dependencies = [ |
3 | | -# "optimum-benchmark[openvino]==0.7", |
4 | | -# "transformers==4.53", |
| 3 | +# "optimum-benchmark[openvino]@git+https://github.com/huggingface/optimum-benchmark.git@main", |
| 4 | +# "optimum-intel@git+https://github.com/huggingface/optimum-intel.git@main", |
| 5 | +# "transformers==4.55", |
5 | 6 | # "torchvision", |
6 | 7 | # "num2words", |
7 | 8 | # ] |
8 | 9 | # /// |
9 | 10 |
|
10 | | -from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, OpenVINOConfig, ProcessConfig, PyTorchConfig |
11 | | -from optimum_benchmark.logging_utils import setup_logging |
12 | | - |
13 | | -setup_logging(level="INFO", to_file=True, prefix="OPTIMUM-BENCHMARK") |
| 11 | +import matplotlib.pyplot as plt |
14 | 12 |
|
| 13 | +from optimum_benchmark import ( |
| 14 | + Benchmark, |
| 15 | + BenchmarkConfig, |
| 16 | + BenchmarkReport, |
| 17 | + InferenceConfig, |
| 18 | + OpenVINOConfig, |
| 19 | + ProcessConfig, |
| 20 | + PyTorchConfig, |
| 21 | +) |
| 22 | +from optimum_benchmark.logging_utils import setup_logging |
15 | 23 |
|
if __name__ == "__main__":
    # Log to stdout and to a file; prefix every record for easy grepping.
    setup_logging(level="INFO", to_file=True, prefix="OPTIMUM-BENCHMARK")

    model = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"

    # Each benchmark runs in its own isolated subprocess.
    launcher_config = ProcessConfig()

    # Inference scenario: track memory and latency while generating exactly
    # 16 new tokens (min == max pins the decode length) from a batch of one
    # 16-token prompt with a single image.
    scenario_config = InferenceConfig(
        memory=True,
        latency=True,
        generate_kwargs={"max_new_tokens": 16, "min_new_tokens": 16},
        input_shapes={"batch_size": 1, "sequence_length": 16, "num_images": 1},
    )
|
29 | 40 | device="cpu", |
30 | 41 | model=model, |
31 | 42 | no_weights=True, |
32 | | - quantization_config={"bits": 8, "weight_only": True, "num_samples": 1}, |
| 43 | + quantization_config={"bits": 8, "num_samples": 1, "weight_only": True}, |
33 | 44 | ), |
34 | 45 | "openvino-8bit-static": OpenVINOConfig( |
35 | 46 | device="cpu", |
36 | 47 | model=model, |
37 | 48 | no_weights=True, |
38 | | - quantization_config={"n_bits": 8, "weight_only": False, "num_samples": 1}, |
| 49 | + quantization_config={"bits": 8, "num_samples": 1, "dataset": "contextual"}, |
39 | 50 | ), |
40 | 51 | } |
41 | 52 |
|
42 | | - results = {} |
43 | 53 | for config_name, backend_config in backend_configs.items(): |
44 | 54 | benchmark_config = BenchmarkConfig( |
45 | 55 | name=f"{config_name}", |
|
48 | 58 | backend=backend_config, |
49 | 59 | ) |
50 | 60 | benchmark_report = Benchmark.launch(benchmark_config) |
51 | | - benchmark_report.save_json(f"{config_name}_vlm_benchmark_report.json") |
52 | | - results[config_name] = benchmark_report |
53 | | - |
54 | | - for config_name, benchmark_report in results.items(): |
55 | | - print("-" * 80) |
56 | | - print(f"Results for {config_name}:") |
57 | | - print("- Prefill Metrics:") # prefill = the processing of the input (text + image) to produce the first token |
58 | | - benchmark_report.prefill.log() |
59 | | - print("- Decode Metrics:") # decode = the processing of subsequent tokens |
60 | | - benchmark_report.decode.log() |
| 61 | + # benchmark_report.to_json(f"{config_name}_report.json") |
| 62 | + benchmark_report.push_to_hub(repo_id="IlyasMoutawwakil/vlm_benchmark", filename=f"{config_name}_report") |
| 63 | + |
| 64 | + backend_reports = {} |
| 65 | + for config_name in backend_configs.keys(): |
| 66 | + # backend_reports[config_name] = BenchmarkReport.from_json(f"{config_name}_report.json") |
| 67 | + backend_reports[config_name] = BenchmarkReport.from_hub( |
| 68 | + repo_id="IlyasMoutawwakil/vlm_benchmark", filename=f"{config_name}_report" |
| 69 | + ) |
| 70 | + |
| 71 | + _, ax = plt.subplots() |
| 72 | + ax.boxplot( |
| 73 | + [backend_reports[config_name].prefill.latency.values for config_name in backend_reports.keys()], |
| 74 | + tick_labels=backend_reports.keys(), |
| 75 | + showfliers=False, |
| 76 | + ) |
| 77 | + plt.xticks(rotation=10) |
| 78 | + ax.set_ylabel("Latency (s)") |
| 79 | + ax.set_xlabel("Configurations") |
| 80 | + ax.set_title("Prefill Latencies") |
| 81 | + plt.savefig("prefill_latencies_boxplot.png") |
| 82 | + |
| 83 | + _, ax = plt.subplots() |
| 84 | + ax.boxplot( |
| 85 | + [backend_reports[config_name].per_token.latency.values for config_name in backend_reports.keys()], |
| 86 | + tick_labels=backend_reports.keys(), |
| 87 | + showfliers=False, |
| 88 | + ) |
| 89 | + plt.xticks(rotation=10) |
| 90 | + ax.set_ylabel("Latency (s)") |
| 91 | + ax.set_xlabel("Configurations") |
| 92 | + ax.set_title("Per-token Latencies") |
| 93 | + plt.savefig("per_token_latencies_boxplot.png") |
| 94 | + |
| 95 | + _, ax = plt.subplots() |
| 96 | + ax.bar( |
| 97 | + list(backend_reports.keys()), |
| 98 | + [backend_reports[config_name].generate.memory.max_ram for config_name in backend_reports.keys()], |
| 99 | + color=["C0", "C1", "C2", "C3", "C4", "C5"], |
| 100 | + ) |
| 101 | + plt.xticks(rotation=10) |
| 102 | + ax.set_title("Max RAM") |
| 103 | + ax.set_ylabel("RAM (MB)") |
| 104 | + ax.set_xlabel("Configurations") |
| 105 | + plt.savefig("max_ram_barplot.png") |
0 commit comments