77#include "delta.h"
88#include "count-delta.h"
99
10- static int very_different (struct diff_filespec * src ,
11- struct diff_filespec * dst ,
12- int min_score )
10+ static int should_break (struct diff_filespec * src ,
11+ struct diff_filespec * dst ,
12+ int break_score ,
13+ int * merge_score_p )
1314{
1415 /* dst is recorded as a modification of src. Are they so
1516 * different that we are better off recording this as a pair
16- * of delete and create? min_score is the minimum amount of
17- * new material that must exist in the dst and not in src for
18- * the pair to be considered a complete rewrite, and recommended
19- * to be set to a very high value, 99% or so.
17+ * of delete and create?
2018 *
21- * The value we return represents the amount of new material
22- * that is in dst and not in src. We return 0 when we do not
23- * want to get the filepair broken.
19+ * There are two criteria used in this algorithm. For the
20+ * purposes of helping later rename/copy, we take both delete
21+ * and insert into account and estimate the amount of "edit".
22+ * If the edit is very large, we break this pair so that
23+ * rename/copy can pick the pieces up to match with other
24+ * files.
25+ *
26+ * On the other hand, we would want to ignore inserts for the
27+ * pure "complete rewrite" detection. As long as most of the
28+ * existing contents were removed from the file, it is a
29+ * complete rewrite, and if sizable chunk from the original
30+ * still remains in the result, it is not a rewrite. It does
31+ * not matter how much or how little new material is added to
32+ * the file.
33+ *
34+ * The score we leave for such a broken filepair uses the
35+ * latter definition so that the later clean-up stage can find
36+ * the pieces that should not have been broken according to the
37+ * latter definition after rename/copy runs, and merge the
38+ * broken pairs that have a score lower than the given criterion
39+ * back together. The break operation itself happens
40+ * according to the former definition.
41+ *
42+ * The break_score parameter tells us when to break (the
43+ * amount of "edit" required for us to consider breaking the
44+ * pair). We leave the amount of deletion in *merge_score_p
45+ * when we return.
46+ *
47+ * The value we return is 1 if we want the pair to be broken,
48+ * or 0 if we do not.
2449 */
2550 void * delta ;
2651 unsigned long delta_size , base_size , src_copied , literal_added ;
52+ int to_break = 0 ;
53+
54+ * merge_score_p = 0 ; /* assume no deletion --- "do not break"
55+ * is the default.
56+ */
2757
2858 if (!S_ISREG (src -> mode ) || !S_ISREG (dst -> mode ))
2959 return 0 ; /* leave symlink rename alone */
3060
31- if (diff_populate_filespec (src , 1 ) || diff_populate_filespec (dst , 1 ))
61+ if (diff_populate_filespec (src , 0 ) || diff_populate_filespec (dst , 0 ))
3262 return 0 ; /* error but caught downstream */
3363
3464 delta_size = ((src -> size < dst -> size ) ?
@@ -40,53 +70,95 @@ static int very_different(struct diff_filespec *src,
4070 */
4171 base_size = ((src -> size < dst -> size ) ? dst -> size : src -> size );
4272
43- /*
44- * If file size difference is too big compared to the
45- * base_size, we declare this a complete rewrite.
46- */
47- if (base_size * min_score < delta_size * MAX_SCORE )
48- return MAX_SCORE ;
49-
50- if (diff_populate_filespec (src , 0 ) || diff_populate_filespec (dst , 0 ))
51- return 0 ; /* error but caught downstream */
52-
5373 delta = diff_delta (src -> data , src -> size ,
5474 dst -> data , dst -> size ,
5575 & delta_size );
5676
57- /* A delta that has a lot of literal additions would have
58- * big delta_size no matter what else it does.
59- */
60- if (base_size * min_score < delta_size * MAX_SCORE )
61- return MAX_SCORE ;
62-
6377 /* Estimate the edit size by interpreting delta. */
64- if (count_delta (delta , delta_size , & src_copied , & literal_added )) {
78+ if (count_delta (delta , delta_size ,
79+ & src_copied , & literal_added )) {
6580 free (delta );
66- return 0 ;
81+ return 0 ; /* we cannot tell */
6782 }
6883 free (delta );
6984
70- /* Extent of damage */
71- if (src -> size + literal_added < src_copied )
72- delta_size = 0 ;
85+ /* Compute merge-score, which is "how much is removed
86+ * from the source material". The clean-up stage will
87+ * merge the surviving pair together if the score is
88+ * less than the minimum, after rename/copy runs.
89+ */
90+ if (src -> size <= src_copied )
91+ delta_size = 0 ; /* avoid wrapping around */
92+ else
93+ delta_size = src -> size - src_copied ;
94+ * merge_score_p = delta_size * MAX_SCORE / src -> size ;
95+
96+ /* Extent of damage, which counts both inserts and
97+ * deletes.
98+ */
99+ if (src -> size + literal_added <= src_copied )
100+ delta_size = 0 ; /* avoid wrapping around */
73101 else
74102 delta_size = (src -> size - src_copied ) + literal_added ;
103+
104+ /* We break if the edit exceeds the minimum.
105+ * i.e. (break_score / MAX_SCORE < delta_size / base_size)
106+ */
107+ if (break_score * base_size < delta_size * MAX_SCORE )
108+ to_break = 1 ;
75109
76- if (base_size < delta_size )
77- return MAX_SCORE ;
78-
79- return delta_size * MAX_SCORE / base_size ;
110+ return to_break ;
80111}
81112
82- void diffcore_break (int min_score )
113+ void diffcore_break (int break_score )
83114{
84115 struct diff_queue_struct * q = & diff_queued_diff ;
85116 struct diff_queue_struct outq ;
117+
118+ /* When the filepair has this much edit (insert and delete),
119+ * it is first considered to be a rewrite and broken into a
120+ * create and delete filepair. This is to help breaking a
121+ * file that had too much new stuff added, possibly from
122+ * moving contents from another file, so that rename/copy can
123+ * match it with the other file.
124+ *
125+ * int break_score; we reuse incoming parameter for this.
126+ */
127+
128+ /* After a pair is broken according to break_score and
129+ * subjected to rename/copy, both of them may survive intact,
130+ * due to lack of suitable rename/copy peer. Or, the caller
131+ * may be calling us without using rename/copy. When that
132+ * happens, we merge the broken pieces back into one
133+ * modification together if the pair did not have more than
134+ * this much delete. For this computation, we do not take
135+ * insert into account at all. If you start from a 100-line
136+ * file and delete 97 lines of it, it does not matter if you
137+ * add 27 lines to it to make a new 30-line file or if you add
138+ * 997 lines to it to make a 1000-line file. Either way what
139+ * you did was a rewrite of 97%. On the other hand, if you
140+ * delete 3 lines, keeping 97 lines intact, it does not matter
141+ * if you add 3 lines to it to make a new 100-line file or if
142+ * you add 903 lines to it to make a new 1000-line file.
143+ * Either way you did a lot of additions and not a rewrite.
144+ * This merge happens to catch the latter case. A merge_score
145+ * of 80% would be a good default value (a broken pair that
146+ * has score lower than merge_score will be merged back
147+ * together).
148+ */
149+ int merge_score ;
86150 int i ;
87151
88- if (!min_score )
89- min_score = DEFAULT_BREAK_SCORE ;
152+ /* See comment on DEFAULT_BREAK_SCORE and
153+ * DEFAULT_MERGE_SCORE in diffcore.h
154+ */
155+ merge_score = (break_score >> 16 ) & 0xFFFF ;
156+ break_score = (break_score & 0xFFFF );
157+
158+ if (!break_score )
159+ break_score = DEFAULT_BREAK_SCORE ;
160+ if (!merge_score )
161+ merge_score = DEFAULT_MERGE_SCORE ;
90162
91163 outq .nr = outq .alloc = 0 ;
92164 outq .queue = NULL ;
@@ -101,12 +173,22 @@ void diffcore_break(int min_score)
101173 if (DIFF_FILE_VALID (p -> one ) && DIFF_FILE_VALID (p -> two ) &&
102174 !S_ISDIR (p -> one -> mode ) && !S_ISDIR (p -> two -> mode ) &&
103175 !strcmp (p -> one -> path , p -> two -> path )) {
104- score = very_different ( p -> one , p -> two , min_score );
105- if ( min_score <= score ) {
176+ if ( should_break ( p -> one , p -> two ,
177+ break_score , & score ) ) {
106178 /* Split this into delete and create */
107179 struct diff_filespec * null_one , * null_two ;
108180 struct diff_filepair * dp ;
109181
182+ /* Set score to 0 for the pair that
183+ * needs to be merged back together
184+ * should they survive rename/copy.
185+ * Also we do not want to break very
186+ * small files.
187+ */
188+ if ((score < merge_score ) ||
189+ (p -> one -> size < MINIMUM_BREAK_SIZE ))
190+ score = 0 ;
191+
110192 /* deletion of one */
111193 null_one = alloc_filespec (p -> one -> path );
112194 dp = diff_queue (& outq , p -> one , null_one );
@@ -132,3 +214,77 @@ void diffcore_break(int min_score)
132214
133215 return ;
134216}
217+
218+ static void merge_broken (struct diff_filepair * p ,
219+ struct diff_filepair * pp ,
220+ struct diff_queue_struct * outq )
221+ {
222+ /* p and pp are broken pairs we want to merge */
223+ struct diff_filepair * c = p , * d = pp ;
224+ if (DIFF_FILE_VALID (p -> one )) {
225+ /* this must be a delete half */
226+ d = p ; c = pp ;
227+ }
228+ /* Sanity check */
229+ if (!DIFF_FILE_VALID (d -> one ))
230+ die ("internal error in merge #1" );
231+ if (DIFF_FILE_VALID (d -> two ))
232+ die ("internal error in merge #2" );
233+ if (DIFF_FILE_VALID (c -> one ))
234+ die ("internal error in merge #3" );
235+ if (!DIFF_FILE_VALID (c -> two ))
236+ die ("internal error in merge #4" );
237+
238+ diff_queue (outq , d -> one , c -> two );
239+ diff_free_filespec_data (d -> two );
240+ diff_free_filespec_data (c -> one );
241+ free (d );
242+ free (c );
243+ }
244+
245+ void diffcore_merge_broken (void )
246+ {
247+ struct diff_queue_struct * q = & diff_queued_diff ;
248+ struct diff_queue_struct outq ;
249+ int i , j ;
250+
251+ outq .nr = outq .alloc = 0 ;
252+ outq .queue = NULL ;
253+
254+ for (i = 0 ; i < q -> nr ; i ++ ) {
255+ struct diff_filepair * p = q -> queue [i ];
256+ if (!p )
257+ /* we already merged this with its peer */
258+ continue ;
259+ else if (p -> broken_pair &&
260+ p -> score == 0 &&
261+ !strcmp (p -> one -> path , p -> two -> path )) {
262+ /* If the peer also survived rename/copy, then
263+ * we merge them back together.
264+ */
265+ for (j = i + 1 ; j < q -> nr ; j ++ ) {
266+ struct diff_filepair * pp = q -> queue [j ];
267+ if (pp -> broken_pair &&
268+ p -> score == 0 &&
269+ !strcmp (pp -> one -> path , pp -> two -> path ) &&
270+ !strcmp (p -> one -> path , pp -> two -> path )) {
271+ /* Peer survived. Merge them */
272+ merge_broken (p , pp , & outq );
273+ q -> queue [j ] = NULL ;
274+ break ;
275+ }
276+ }
277+ if (q -> nr <= j )
278+ /* The peer did not survive, so we keep
279+ * it in the output.
280+ */
281+ diff_q (& outq , p );
282+ }
283+ else
284+ diff_q (& outq , p );
285+ }
286+ free (q -> queue );
287+ * q = outq ;
288+
289+ return ;
290+ }
0 commit comments