@@ -404,7 +404,10 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
404404 struct ggml_tensor * output,
405405 int x,
406406 int y,
407- int overlap) {
407+ int overlap_x,
408+ int overlap_y,
409+ int x_skip = 0 ,
410+ int y_skip = 0 ) {
408411 int64_t width = input->ne [0 ];
409412 int64_t height = input->ne [1 ];
410413 int64_t channels = input->ne [2 ];
@@ -413,17 +416,17 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
413416 int64_t img_height = output->ne [1 ];
414417
415418 GGML_ASSERT (input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
416- for (int iy = 0 ; iy < height; iy++) {
417- for (int ix = 0 ; ix < width; ix++) {
419+ for (int iy = y_skip ; iy < height; iy++) {
420+ for (int ix = x_skip ; ix < width; ix++) {
418421 for (int k = 0 ; k < channels; k++) {
419422 float new_value = ggml_tensor_get_f32 (input, ix, iy, k);
420- if (overlap > 0 ) { // blend colors in overlapped area
423+ if (overlap_x > 0 || overlap_y > 0 ) { // blend colors in overlapped area
421424 float old_value = ggml_tensor_get_f32 (output, x + ix, y + iy, k);
422425
423- const float x_f_0 = (x > 0 ) ? ix / float (overlap ) : 1 ;
424- const float x_f_1 = (x < (img_width - width)) ? (width - ix) / float (overlap ) : 1 ;
425- const float y_f_0 = (y > 0 ) ? iy / float (overlap ) : 1 ;
426- const float y_f_1 = (y < (img_height - height)) ? (height - iy) / float (overlap ) : 1 ;
426+ const float x_f_0 = (overlap_x > 0 && x > 0 ) ? ( ix - x_skip) / float (overlap_x ) : 1 ;
427+ const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float (overlap_x ) : 1 ;
428+ const float y_f_0 = (overlap_y > 0 && y > 0 ) ? ( iy - y_skip) / float (overlap_y ) : 1 ;
429+ const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float (overlap_y ) : 1 ;
427430
428431 const float x_f = std::min (std::min (x_f_0, x_f_1), 1 .f );
429432 const float y_f = std::min (std::min (y_f_0, y_f_1), 1 .f );
@@ -537,19 +540,77 @@ __STATIC_INLINE__ void ggml_tensor_scale_output(struct ggml_tensor* src) {
// Callback type used by sd_tiling to process a single tile: it receives two
// ggml_tensor pointers and a bool, and returns nothing.
// In the visible sd_tiling code it is invoked once before the tile loop as
// (input_tile, NULL, true), then once per tile as (input_tile, output_tile, false).
// NOTE(review): the bool presumably marks that initial setup call - confirm
// against the callback implementations.
537540typedef std::function<void (ggml_tensor*, ggml_tensor*, bool )> on_tile_process;
538541
539542// Tiling
540- __STATIC_INLINE__ void sd_tiling (ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
543+ __STATIC_INLINE__ void sd_tiling (ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing, bool scaled_out = true ) {
541544 int input_width = (int )input->ne [0 ];
542545 int input_height = (int )input->ne [1 ];
543546 int output_width = (int )output->ne [0 ];
544547 int output_height = (int )output->ne [1 ];
548+
549+ int input_tile_size, output_tile_size;
550+ if (scaled_out) {
551+ input_tile_size = tile_size;
552+ output_tile_size = tile_size * scale;
553+ } else {
554+ input_tile_size = tile_size * scale;
555+ output_tile_size = tile_size;
556+ }
557+ int tile_overlap = (input_tile_size * tile_overlap_factor);
558+ int non_tile_overlap = input_tile_size - tile_overlap;
559+
560+ int num_tiles_x = (input_width - tile_overlap) / non_tile_overlap;
561+ int overshoot_x = ((num_tiles_x + 1 ) * non_tile_overlap + tile_overlap) % input_width;
562+
563+ if ((overshoot_x != non_tile_overlap) && (overshoot_x <= num_tiles_x * (input_tile_size / 2 - tile_overlap))) {
564+ // if tiles don't fit perfectly using the desired overlap
565+ // and there is enough room to squeeze an extra tile without overlap becoming >0.5
566+ num_tiles_x++;
567+ }
568+
569+ float tile_overlap_factor_x = (float )(input_tile_size * num_tiles_x - input_width) / (float )(input_tile_size * (num_tiles_x - 1 ));
570+ if (num_tiles_x <= 2 ) {
571+ if (input_width == input_tile_size) {
572+ num_tiles_x = 1 ;
573+ tile_overlap_factor_x = 0 ;
574+ } else {
575+ num_tiles_x = 2 ;
576+ tile_overlap_factor_x = (2 * input_tile_size - input_width) / (float )input_tile_size;
577+ }
578+ }
579+
580+ int num_tiles_y = (input_height - tile_overlap) / non_tile_overlap;
581+ int overshoot_y = ((num_tiles_y + 1 ) * non_tile_overlap + tile_overlap) % input_height;
582+
583+ if ((overshoot_y != non_tile_overlap) && (overshoot_y <= num_tiles_y * (input_tile_size / 2 - tile_overlap))) {
584+ // if tiles don't fit perfectly using the desired overlap
585+ // and there is enough room to squeeze an extra tile without overlap becoming >0.5
586+ num_tiles_y++;
587+ }
588+
589+ float tile_overlap_factor_y = (float )(input_tile_size * num_tiles_y - input_height) / (float )(input_tile_size * (num_tiles_y - 1 ));
590+ if (num_tiles_y <= 2 ) {
591+ if (input_height == input_tile_size) {
592+ num_tiles_y = 1 ;
593+ tile_overlap_factor_y = 0 ;
594+ } else {
595+ num_tiles_y = 2 ;
596+ tile_overlap_factor_y = (2 * input_tile_size - input_height) / (float )input_tile_size;
597+ }
598+ }
599+
600+ LOG_DEBUG (" num tiles : %d, %d " , num_tiles_x, num_tiles_y);
601+ LOG_DEBUG (" optimal overlap : %f, %f (targeting %f)" , tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);
602+
545603 GGML_ASSERT (input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0 ); // should be multiple of 2
546604
547- int tile_overlap = (int32_t )(tile_size * tile_overlap_factor);
548- int non_tile_overlap = tile_size - tile_overlap;
605+ int tile_overlap_x = (int32_t )(input_tile_size * tile_overlap_factor_x);
606+ int non_tile_overlap_x = input_tile_size - tile_overlap_x;
607+
608+ int tile_overlap_y = (int32_t )(input_tile_size * tile_overlap_factor_y);
609+ int non_tile_overlap_y = input_tile_size - tile_overlap_y;
549610
550611 struct ggml_init_params params = {};
551- params.mem_size += tile_size * tile_size * input->ne [2 ] * sizeof (float ); // input chunk
552- params.mem_size += (tile_size * scale) * (tile_size * scale) * output->ne [2 ] * sizeof (float ); // output chunk
612+ params.mem_size += input_tile_size * input_tile_size * input->ne [2 ] * sizeof (float ); // input chunk
613+ params.mem_size += output_tile_size * output_tile_size * output->ne [2 ] * sizeof (float ); // output chunk
553614 params.mem_size += 3 * ggml_tensor_overhead ();
554615 params.mem_buffer = NULL ;
555616 params.no_alloc = false ;
@@ -564,29 +625,39 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
564625 }
565626
566627 // tiling
567- ggml_tensor* input_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, tile_size, tile_size , input->ne [2 ], 1 );
568- ggml_tensor* output_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale , output->ne [2 ], 1 );
628+ ggml_tensor* input_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, input_tile_size, input_tile_size , input->ne [2 ], 1 );
629+ ggml_tensor* output_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, output_tile_size, output_tile_size , output->ne [2 ], 1 );
569630 on_processing (input_tile, NULL , true );
570- int num_tiles = ceil (( float )input_width / non_tile_overlap) * ceil (( float )input_height / non_tile_overlap) ;
631+ int num_tiles = num_tiles_x * num_tiles_y ;
571632 LOG_INFO (" processing %i tiles" , num_tiles);
572633 pretty_progress (1 , num_tiles, 0 .0f );
573634 int tile_count = 1 ;
574635 bool last_y = false , last_x = false ;
575636 float last_time = 0 .0f ;
576- for (int y = 0 ; y < input_height && !last_y; y += non_tile_overlap) {
577- if (y + tile_size >= input_height) {
578- y = input_height - tile_size;
637+ for (int y = 0 ; y < input_height && !last_y; y += non_tile_overlap_y) {
638+ int dy = 0 ;
639+ if (y + input_tile_size >= input_height) {
640+ int _y = y;
641+ y = input_height - input_tile_size;
642+ dy = _y - y;
579643 last_y = true ;
580644 }
581- for (int x = 0 ; x < input_width && !last_x; x += non_tile_overlap) {
582- if (x + tile_size >= input_width) {
583- x = input_width - tile_size;
645+ for (int x = 0 ; x < input_width && !last_x; x += non_tile_overlap_x) {
646+ int dx = 0 ;
647+ if (x + input_tile_size >= input_width) {
648+ int _x = x;
649+ x = input_width - input_tile_size;
650+ dx = _x - x;
584651 last_x = true ;
585652 }
586653 int64_t t1 = ggml_time_ms ();
587654 ggml_split_tensor_2d (input, input_tile, x, y);
588655 on_processing (input_tile, output_tile, false );
589- ggml_merge_tensor_2d (output_tile, output, x * scale, y * scale, tile_overlap * scale);
656+ if (scaled_out) {
657+ ggml_merge_tensor_2d (output_tile, output, x * scale, y * scale, tile_overlap_x * scale, tile_overlap_y * scale, dx * scale, dy * scale);
658+ } else {
659+ ggml_merge_tensor_2d (output_tile, output, x / scale, y / scale, tile_overlap_x / scale, tile_overlap_y / scale, dx / scale, dy / scale);
660+ }
590661 int64_t t2 = ggml_time_ms ();
591662 last_time = (t2 - t1) / 1000 .0f ;
592663 pretty_progress (tile_count, num_tiles, last_time);
0 commit comments