2525
2626
/*
 * Weight applied to the count of multimatch lines when xdl_clean_mmatch()
 * compares it against the total length of the surrounding run.
 */
#define XDL_KPDIS_RUN 4

/*
 * Upper bound for the per-line match-count limit that xdl_cleanup_records()
 * derives from the number of records in a file.
 */
#define XDL_MAX_EQLIMIT 1024
2829
2930
3031
@@ -305,26 +306,48 @@ void xdl_free_env(xdfenv_t *xe) {
305306
306307
307308static int xdl_clean_mmatch (char const * dis , long i , long s , long e ) {
308- long r , rdis , rpdis ;
309-
310- for (r = 1 , rdis = 0 , rpdis = 1 ; (i - r ) >= s ; r ++ ) {
309+ long r , rdis0 , rpdis0 , rdis1 , rpdis1 ;
310+
311+ /*
312+ * Scans the lines before 'i' to find a run of lines that either
313+ * have no match (dis[j] == 0) or have multiple matches (dis[j] > 1).
314+ * Note that we always call this function with dis[i] > 1, so the
315+ * current line (i) is already a multimatch line.
316+ */
317+ for (r = 1 , rdis0 = 0 , rpdis0 = 1 ; (i - r ) >= s ; r ++ ) {
311318 if (!dis [i - r ])
312- rdis ++ ;
319+ rdis0 ++ ;
313320 else if (dis [i - r ] == 2 )
314- rpdis ++ ;
321+ rpdis0 ++ ;
315322 else
316323 break ;
317324 }
318- for (r = 1 ; (i + r ) <= e ; r ++ ) {
325+ /*
326+ * If the run before the line 'i' found only multimatch lines, we
327+ * return 0 and hence we don't make the current line (i) discarded.
328+ * We want to discard multimatch lines only when they appear in the
329+ * middle of runs with nomatch lines (dis[j] == 0).
330+ */
331+ if (rdis0 == 0 )
332+ return 0 ;
333+ for (r = 1 , rdis1 = 0 , rpdis1 = 1 ; (i + r ) <= e ; r ++ ) {
319334 if (!dis [i + r ])
320- rdis ++ ;
335+ rdis1 ++ ;
321336 else if (dis [i + r ] == 2 )
322- rpdis ++ ;
337+ rpdis1 ++ ;
323338 else
324339 break ;
325340 }
326-
327- return rpdis * XDL_KPDIS_RUN < (rpdis + rdis );
341+ /*
342+ * If the run after the line 'i' found only multimatch lines, we
343+ * return 0 and hence we don't make the current line (i) discarded.
344+ */
345+ if (rdis1 == 0 )
346+ return 0 ;
347+ rdis1 += rdis0 ;
348+ rpdis1 += rpdis0 ;
349+
350+ return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1 );
328351}
329352
330353
@@ -334,34 +357,40 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
334357 * might be potentially discarded if they appear in a run of discardable lines.
335358 */
336359static int xdl_cleanup_records (xdfile_t * xdf1 , xdfile_t * xdf2 ) {
337- long i , rhi , nreff ;
360+ long i , nm , rhi , nreff , mlim ;
338361 unsigned long hav ;
339362 xrecord_t * * recs ;
340363 xrecord_t * rec ;
341364 char * dis , * dis1 , * dis2 ;
342365
343- if (!(dis = (char * ) xdl_malloc (( xdf1 -> nrec + xdf2 -> nrec + 2 ) * sizeof ( char ) ))) {
366+ if (!(dis = (char * ) xdl_malloc (xdf1 -> nrec + xdf2 -> nrec + 2 ))) {
344367
345368 return -1 ;
346369 }
347- memset (dis , 0 , ( xdf1 -> nrec + xdf2 -> nrec + 2 ) * sizeof ( char ) );
370+ memset (dis , 0 , xdf1 -> nrec + xdf2 -> nrec + 2 );
348371 dis1 = dis ;
349372 dis2 = dis1 + xdf1 -> nrec + 1 ;
350373
374+ if ((mlim = xdl_bogosqrt (xdf1 -> nrec )) > XDL_MAX_EQLIMIT )
375+ mlim = XDL_MAX_EQLIMIT ;
351376 for (i = xdf1 -> dstart , recs = & xdf1 -> recs [xdf1 -> dstart ]; i <= xdf1 -> dend ; i ++ , recs ++ ) {
352377 hav = (* recs )-> ha ;
353378 rhi = (long ) XDL_HASHLONG (hav , xdf2 -> hbits );
354- for (rec = xdf2 -> rhash [rhi ]; rec ; rec = rec -> next )
355- if (rec -> ha == hav && ++ dis1 [ i ] == 2 )
379+ for (nm = 0 , rec = xdf2 -> rhash [rhi ]; rec ; rec = rec -> next )
380+ if (rec -> ha == hav && ++ nm == mlim )
356381 break ;
382+ dis1 [i ] = (nm == 0 ) ? 0 : (nm >= mlim ) ? 2 : 1 ;
357383 }
358384
385+ if ((mlim = xdl_bogosqrt (xdf2 -> nrec )) > XDL_MAX_EQLIMIT )
386+ mlim = XDL_MAX_EQLIMIT ;
359387 for (i = xdf2 -> dstart , recs = & xdf2 -> recs [xdf2 -> dstart ]; i <= xdf2 -> dend ; i ++ , recs ++ ) {
360388 hav = (* recs )-> ha ;
361389 rhi = (long ) XDL_HASHLONG (hav , xdf1 -> hbits );
362- for (rec = xdf1 -> rhash [rhi ]; rec ; rec = rec -> next )
363- if (rec -> ha == hav && ++ dis2 [ i ] == 2 )
390+ for (nm = 0 , rec = xdf1 -> rhash [rhi ]; rec ; rec = rec -> next )
391+ if (rec -> ha == hav && ++ nm == mlim )
364392 break ;
393+ dis2 [i ] = (nm == 0 ) ? 0 : (nm >= mlim ) ? 2 : 1 ;
365394 }
366395
367396 for (nreff = 0 , i = xdf1 -> dstart , recs = & xdf1 -> recs [xdf1 -> dstart ];
0 commit comments