@@ -5016,7 +5016,7 @@ static std::vector<std::pair<int, int>> ds_build_target_ratios(const int min_num
50165016 return ratios;
50175017}
50185018
5019- static std::pair<int , int > ds_find_closest_aspect_ratio (
5019+ static std::pair<int , int > ds_find_closest_ratio (
50205020 const float aspect_ratio,
50215021 const std::vector<std::pair<int , int >> &target_ratios,
50225022 const int width,
@@ -5382,60 +5382,53 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
53825382 /* Dynamic Resolution (Gundam/Gundam-Master) */
53835383
53845384 // configurable, or read from params
5385- const int min_num = 2 ;
5386- const int max_num = 9 ;
5387- const int image_size = params.image_size ; // typically 640
5388- // const bool use_thumbnail = true; // mimic python's use_thumbnail
5389-
5385+ const int min_num = 2 ;
5386+ const int max_num = 9 ;
5387+ const int image_size = (mode_i == 4 ) ? 640 : 1024 ;
5388+
53905389 // original image size
5391- const int orig_w = original_size.width ;
5392- const int orig_h = original_size.height ;
5390+ const int orig_w = original_size.width ;
5391+ const int orig_h = original_size.height ;
53935392
5394- // 1) build candidate grids (cols, rows)
5393+ // create overview image (thumbnail)
5394+ clip_image_u8_ptr overview_img (clip_image_u8_init ());
5395+ img_tool::resize (*img, *overview_img, { image_size, image_size },
5396+ img_tool::RESIZE_ALGO_BICUBIC_PILLOW, true , color);
5397+ clip_image_f32_ptr overview_f32 (clip_image_f32_init ());
5398+ normalize_image_u8_to_f32 (*overview_img, *overview_f32, params.image_mean , params.image_std );
5399+ res_imgs->entries .push_back (std::move (overview_f32));
5400+
5401+ // build candidate grids (cols, rows)
53955402 auto target_ratios = ds_build_target_ratios (min_num, max_num);
53965403
5397- // 2) pick the grid that best matches the original aspect ratio
5404+ // pick the grid that best matches the original aspect ratio
53985405 const float aspect_ratio = static_cast <float >(orig_w) / static_cast <float >(orig_h);
5399- auto best = ds_find_closest_aspect_ratio (aspect_ratio, target_ratios, orig_w, orig_h, image_size);
5406+ auto best = ds_find_closest_ratio (aspect_ratio, target_ratios, orig_w, orig_h, image_size);
54005407 const int grid_cols = best.first ; // how many tiles horizontally
54015408 const int grid_rows = best.second ; // how many tiles vertically
5402-
5403- // 3) compute the target (forced) size — python did:
5404- // target_width = image_size * cols
5405- // target_height = image_size * rows
5406- const clip_image_size refined_size{ image_size * grid_cols, image_size * grid_rows };
5407-
5408- // 4) prepare slice instructions, same style as the idefics3 branch
5409- llava_uhd::slice_instructions instructions;
5410- instructions.overview_size = clip_image_size{ image_size, image_size }; // for thumbnail/global
5411- instructions.refined_size = refined_size;
5412- instructions.grid_size = clip_image_size{ grid_cols, grid_rows };
5413-
5414- // in deepseek python they always produce *full* 640x640 blocks,
5415- // so we can do a simple double loop over rows/cols:
5409+
5410+ // resize to refined size (no padding, direct resize)
5411+ clip_image_u8_ptr refined_img (clip_image_u8_init ());
5412+ img_tool::resize (*img, *refined_img, { image_size * grid_cols, image_size * grid_rows },
5413+ img_tool::RESIZE_ALGO_BICUBIC_PILLOW, false );
5414+
5415+ // crop slices from the refined image
54165416 for (int r = 0 ; r < grid_rows; ++r) {
54175417 for (int c = 0 ; c < grid_cols; ++c) {
54185418 const int x = c * image_size;
54195419 const int y = r * image_size;
5420-
5421- instructions.slices .push_back (llava_uhd::slice_coordinates{
5422- /* x */ x,
5423- /* y */ y,
5424- /* size */ clip_image_size{ image_size, image_size }
5425- });
5420+
5421+ // crop the slice
5422+ clip_image_u8_ptr slice_img (clip_image_u8_init ());
5423+ img_tool::crop (*refined_img, *slice_img, x, y, image_size, image_size);
5424+
5425+ // normalize and add to results
5426+ clip_image_f32_ptr slice_f32 (clip_image_f32_init ());
5427+ normalize_image_u8_to_f32 (*slice_img, *slice_f32, params.image_mean , params.image_std );
5428+ res_imgs->entries .push_back (std::move (slice_f32));
54265429 }
54275430 }
5428-
5429- // 5) run the actual slicing (this should: resize to refined_size, then crop every slice)
5430- auto imgs = llava_uhd::slice_image (img, instructions);
5431-
5432- // 7) cast & normalize like the idefics3 branch
5433- for (size_t i = 0 ; i < imgs.size (); ++i) {
5434- clip_image_f32_ptr res (clip_image_f32_init ());
5435- normalize_image_u8_to_f32 (*imgs[i], *res, params.image_mean , params.image_std );
5436- res_imgs->entries .push_back (std::move (res));
5437- }
5438-
5431+
54395432 // keep the grid info — the model may need to know how to reassemble / attend
54405433 res_imgs->grid_x = grid_cols;
54415434 res_imgs->grid_y = grid_rows;
@@ -5971,8 +5964,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
59715964 // do nothing
59725965 } break ;
59735966 case PROJECTOR_TYPE_DEEPSEEKOCR:
5974- {
5975- } break ;
5967+ {
5968+ } break ;
59765969 case PROJECTOR_TYPE_LLAMA4:
59775970 {
59785971 // set the 2D positions
0 commit comments