@@ -367,6 +367,144 @@ static int find_exact_renames(struct diff_options *options)
367367 return renames ;
368368}
369369
/*
 * Return a pointer to the portion of filename that follows its last
 * '/', or filename itself when it contains no '/'.
 *
 * This is deliberately much simpler than gitbasename(): paths as stored
 * in git have no drive prefixes, no repeated separators and no trailing
 * slashes, and are never NULL or empty, so none of those special cases
 * need to be handled here.
 */
static const char *get_basename(const char *filename)
{
	const char *last_slash = strrchr(filename, '/');

	if (!last_slash)
		return filename;
	return last_slash + 1;
}
381+
382+ static int find_basename_matches (struct diff_options * options ,
383+ int minimum_score )
384+ {
385+ /*
386+ * When I checked in early 2020, over 76% of file renames in linux
387+ * just moved files to a different directory but kept the same
388+ * basename. gcc did that with over 64% of renames, gecko did it
389+ * with over 79%, and WebKit did it with over 89%.
390+ *
391+ * Therefore we can bypass the normal exhaustive NxM matrix
392+ * comparison of similarities between all potential rename sources
393+ * and destinations by instead using file basename as a hint (i.e.
394+ * the portion of the filename after the last '/'), checking for
395+ * similarity between files with the same basename, and if we find
396+ * a pair that are sufficiently similar, record the rename pair and
397+ * exclude those two from the NxM matrix.
398+ *
399+ * This *might* cause us to find a less than optimal pairing (if
400+ * there is another file that we are even more similar to but has a
401+ * different basename). Given the huge performance advantage
402+ * basename matching provides, and given the frequency with which
403+ * people use the same basename in real world projects, that's a
404+ * trade-off we are willing to accept when doing just rename
405+ * detection.
406+ *
407+ * If someone wants copy detection that implies they are willing to
408+ * spend more cycles to find similarities between files, so it may
409+ * be less likely that this heuristic is wanted. If someone is
410+ * doing break detection, that means they do not want filename
411+ * similarity to imply any form of content similiarity, and thus
412+ * this heuristic would definitely be incompatible.
413+ */
414+
415+ int i , renames = 0 ;
416+ struct strintmap sources ;
417+ struct strintmap dests ;
418+ struct hashmap_iter iter ;
419+ struct strmap_entry * entry ;
420+
421+ /*
422+ * The prefeteching stuff wants to know if it can skip prefetching
423+ * blobs that are unmodified...and will then do a little extra work
424+ * to verify that the oids are indeed different before prefetching.
425+ * Unmodified blobs are only relevant when doing copy detection;
426+ * when limiting to rename detection, diffcore_rename[_extended]()
427+ * will never be called with unmodified source paths fed to us, so
428+ * the extra work necessary to check if rename_src entries are
429+ * unmodified would be a small waste.
430+ */
431+ int skip_unmodified = 0 ;
432+
433+ /*
434+ * Create maps of basename -> fullname(s) for remaining sources and
435+ * dests.
436+ */
437+ strintmap_init_with_options (& sources , -1 , NULL , 0 );
438+ strintmap_init_with_options (& dests , -1 , NULL , 0 );
439+ for (i = 0 ; i < rename_src_nr ; ++ i ) {
440+ char * filename = rename_src [i ].p -> one -> path ;
441+ const char * base ;
442+
443+ /* exact renames removed in remove_unneeded_paths_from_src() */
444+ assert (!rename_src [i ].p -> one -> rename_used );
445+
446+ /* Record index within rename_src (i) if basename is unique */
447+ base = get_basename (filename );
448+ if (strintmap_contains (& sources , base ))
449+ strintmap_set (& sources , base , -1 );
450+ else
451+ strintmap_set (& sources , base , i );
452+ }
453+ for (i = 0 ; i < rename_dst_nr ; ++ i ) {
454+ char * filename = rename_dst [i ].p -> two -> path ;
455+ const char * base ;
456+
457+ if (rename_dst [i ].is_rename )
458+ continue ; /* involved in exact match already. */
459+
460+ /* Record index within rename_dst (i) if basename is unique */
461+ base = get_basename (filename );
462+ if (strintmap_contains (& dests , base ))
463+ strintmap_set (& dests , base , -1 );
464+ else
465+ strintmap_set (& dests , base , i );
466+ }
467+
468+ /* Now look for basename matchups and do similarity estimation */
469+ strintmap_for_each_entry (& sources , & iter , entry ) {
470+ const char * base = entry -> key ;
471+ intptr_t src_index = (intptr_t )entry -> value ;
472+ intptr_t dst_index ;
473+ if (src_index == -1 )
474+ continue ;
475+
476+ if (0 <= (dst_index = strintmap_get (& dests , base ))) {
477+ struct diff_filespec * one , * two ;
478+ int score ;
479+
480+ /* Estimate the similarity */
481+ one = rename_src [src_index ].p -> one ;
482+ two = rename_dst [dst_index ].p -> two ;
483+ score = estimate_similarity (options -> repo , one , two ,
484+ minimum_score , skip_unmodified );
485+
486+ /* If sufficiently similar, record as rename pair */
487+ if (score < minimum_score )
488+ continue ;
489+ record_rename_pair (dst_index , src_index , score );
490+ renames ++ ;
491+
492+ /*
493+ * Found a rename so don't need text anymore; if we
494+ * didn't find a rename, the filespec_blob would get
495+ * re-used when doing the matrix of comparisons.
496+ */
497+ diff_free_filespec_blob (one );
498+ diff_free_filespec_blob (two );
499+ }
500+ }
501+
502+ strintmap_clear (& sources );
503+ strintmap_clear (& dests );
504+
505+ return renames ;
506+ }
507+
370508#define NUM_CANDIDATE_PER_DST 4
371509static void record_if_better (struct diff_score m [], struct diff_score * o )
372510{
@@ -454,6 +592,54 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
454592 return count ;
455593}
456594
595+ static void remove_unneeded_paths_from_src (int detecting_copies )
596+ {
597+ int i , new_num_src ;
598+
599+ if (detecting_copies )
600+ return ; /* nothing to remove */
601+ if (break_idx )
602+ return ; /* culling incompatible with break detection */
603+
604+ /*
605+ * Note on reasons why we cull unneeded sources but not destinations:
606+ * 1) Pairings are stored in rename_dst (not rename_src), which we
607+ * need to keep around. So, we just can't cull rename_dst even
608+ * if we wanted to. But doing so wouldn't help because...
609+ *
610+ * 2) There is a matrix pairwise comparison that follows the
611+ * "Performing inexact rename detection" progress message.
612+ * Iterating over the destinations is done in the outer loop,
613+ * hence we only iterate over each of those once and we can
614+ * easily skip the outer loop early if the destination isn't
615+ * relevant. That's only one check per destination path to
616+ * skip.
617+ *
618+ * By contrast, the sources are iterated in the inner loop; if
619+ * we check whether a source can be skipped, then we'll be
620+ * checking it N separate times, once for each destination.
621+ * We don't want to have to iterate over known-not-needed
622+ * sources N times each, so avoid that by removing the sources
623+ * from rename_src here.
624+ */
625+ for (i = 0 , new_num_src = 0 ; i < rename_src_nr ; i ++ ) {
626+ /*
627+ * renames are stored in rename_dst, so if a rename has
628+ * already been detected using this source, we can just
629+ * remove the source knowing rename_dst has its info.
630+ */
631+ if (rename_src [i ].p -> one -> rename_used )
632+ continue ;
633+
634+ if (new_num_src < i )
635+ memcpy (& rename_src [new_num_src ], & rename_src [i ],
636+ sizeof (struct diff_rename_src ));
637+ new_num_src ++ ;
638+ }
639+
640+ rename_src_nr = new_num_src ;
641+ }
642+
457643void diffcore_rename (struct diff_options * options )
458644{
459645 int detect_rename = options -> detect_rename ;
@@ -463,9 +649,11 @@ void diffcore_rename(struct diff_options *options)
463649 struct diff_score * mx ;
464650 int i , j , rename_count , skip_unmodified = 0 ;
465651 int num_destinations , dst_cnt ;
652+ int num_sources , want_copies ;
466653 struct progress * progress = NULL ;
467654
468655 trace2_region_enter ("diff" , "setup" , options -> repo );
656+ want_copies = (detect_rename == DIFF_DETECT_COPY );
469657 if (!minimum_score )
470658 minimum_score = DEFAULT_RENAME_SCORE ;
471659
@@ -502,7 +690,7 @@ void diffcore_rename(struct diff_options *options)
502690 p -> one -> rename_used ++ ;
503691 register_rename_src (p );
504692 }
505- else if (detect_rename == DIFF_DETECT_COPY ) {
693+ else if (want_copies ) {
506694 /*
507695 * Increment the "rename_used" score by
508696 * one, to indicate ourselves as a user.
@@ -527,17 +715,60 @@ void diffcore_rename(struct diff_options *options)
527715 if (minimum_score == MAX_SCORE )
528716 goto cleanup ;
529717
530- /*
531- * Calculate how many renames are left (but all the source
532- * files still remain as options for rename/copies!)
533- */
718+ num_sources = rename_src_nr ;
719+
720+ if (want_copies || break_idx ) {
721+ /*
722+ * Cull sources:
723+ * - remove ones corresponding to exact renames
724+ */
725+ trace2_region_enter ("diff" , "cull after exact" , options -> repo );
726+ remove_unneeded_paths_from_src (want_copies );
727+ trace2_region_leave ("diff" , "cull after exact" , options -> repo );
728+ } else {
729+ /* Determine minimum score to match basenames */
730+ double factor = 0.5 ;
731+ char * basename_factor = getenv ("GIT_BASENAME_FACTOR" );
732+ int min_basename_score ;
733+
734+ if (basename_factor )
735+ factor = strtol (basename_factor , NULL , 10 )/100.0 ;
736+ assert (factor >= 0.0 && factor <= 1.0 );
737+ min_basename_score = minimum_score +
738+ (int )(factor * (MAX_SCORE - minimum_score ));
739+
740+ /*
741+ * Cull sources:
742+ * - remove ones involved in renames (found via exact match)
743+ */
744+ trace2_region_enter ("diff" , "cull after exact" , options -> repo );
745+ remove_unneeded_paths_from_src (want_copies );
746+ trace2_region_leave ("diff" , "cull after exact" , options -> repo );
747+
748+ /* Utilize file basenames to quickly find renames. */
749+ trace2_region_enter ("diff" , "basename matches" , options -> repo );
750+ rename_count += find_basename_matches (options ,
751+ min_basename_score );
752+ trace2_region_leave ("diff" , "basename matches" , options -> repo );
753+
754+ /*
755+ * Cull sources, again:
756+ * - remove ones involved in renames (found via basenames)
757+ */
758+ trace2_region_enter ("diff" , "cull basename" , options -> repo );
759+ remove_unneeded_paths_from_src (want_copies );
760+ trace2_region_leave ("diff" , "cull basename" , options -> repo );
761+ }
762+
763+ /* Calculate how many rename destinations are left */
534764 num_destinations = (rename_dst_nr - rename_count );
765+ num_sources = rename_src_nr ; /* rename_src_nr reflects lower number */
535766
536767 /* All done? */
537- if (!num_destinations )
768+ if (!num_destinations || ! num_sources )
538769 goto cleanup ;
539770
540- switch (too_many_rename_candidates (num_destinations , rename_src_nr ,
771+ switch (too_many_rename_candidates (num_destinations , num_sources ,
541772 options )) {
542773 case 1 :
543774 goto cleanup ;
@@ -553,7 +784,7 @@ void diffcore_rename(struct diff_options *options)
553784 if (options -> show_rename_progress ) {
554785 progress = start_delayed_progress (
555786 _ ("Performing inexact rename detection" ),
556- (uint64_t )num_destinations * (uint64_t )rename_src_nr );
787+ (uint64_t )num_destinations * (uint64_t )num_sources );
557788 }
558789
559790 mx = xcalloc (st_mult (NUM_CANDIDATE_PER_DST , num_destinations ),
@@ -563,7 +794,7 @@ void diffcore_rename(struct diff_options *options)
563794 struct diff_score * m ;
564795
565796 if (rename_dst [i ].is_rename )
566- continue ; /* dealt with exact match already. */
797+ continue ; /* exact or basename match already handled */
567798
568799 m = & mx [dst_cnt * NUM_CANDIDATE_PER_DST ];
569800 for (j = 0 ; j < NUM_CANDIDATE_PER_DST ; j ++ )
@@ -573,6 +804,8 @@ void diffcore_rename(struct diff_options *options)
573804 struct diff_filespec * one = rename_src [j ].p -> one ;
574805 struct diff_score this_src ;
575806
807+ assert (!one -> rename_used || want_copies || break_idx );
808+
576809 if (skip_unmodified &&
577810 diff_unmodified_pair (rename_src [j ].p ))
578811 continue ;
@@ -594,15 +827,15 @@ void diffcore_rename(struct diff_options *options)
594827 }
595828 dst_cnt ++ ;
596829 display_progress (progress ,
597- (uint64_t )dst_cnt * (uint64_t )rename_src_nr );
830+ (uint64_t )dst_cnt * (uint64_t )num_sources );
598831 }
599832 stop_progress (& progress );
600833
601834 /* cost matrix sorted by most to least similar pair */
602835 STABLE_QSORT (mx , dst_cnt * NUM_CANDIDATE_PER_DST , score_compare );
603836
604837 rename_count += find_renames (mx , dst_cnt , minimum_score , 0 );
605- if (detect_rename == DIFF_DETECT_COPY )
838+ if (want_copies )
606839 rename_count += find_renames (mx , dst_cnt , minimum_score , 1 );
607840 free (mx );
608841 trace2_region_leave ("diff" , "inexact renames" , options -> repo );
0 commit comments