@@ -1211,20 +1211,87 @@ sub do_fetch {
12111211sub mkemptydirs {
12121212 my ($self , $r ) = @_ ;
12131213
1214+ # add/remove/collect a paths table
1215+ #
1216+ # Paths are split into a tree of nodes, stored as a hash of hashes.
1217+ #
1218+ # Each node contains a 'path' entry for the path (if any) associated
1219+ # with that node and a 'children' entry for any nodes under that
1220+ # location.
1221+ #
1222+ # Removing a path requires a hash lookup for each component then
1223+ # dropping that node (and anything under it), which is substantially
1224+ # faster than a grep slice into a single hash of paths for large
1225+ # numbers of paths.
1226+ #
1227+ # For a large (200K) number of empty_dir directives this reduces
1228+ # scanning time to 3 seconds vs 10 minutes for grep+delete on a single
1229+ # hash of paths.
1230+ sub add_path {
1231+ my ($paths_table , $path ) = @_ ;
1232+ my $node_ref ;
1233+
1234+ foreach my $x (split (' /' , $path )) {
1235+ if (!exists ($paths_table -> {$x })) {
1236+ $paths_table -> {$x } = { children => {} };
1237+ }
1238+
1239+ $node_ref = $paths_table -> {$x };
1240+ $paths_table = $paths_table -> {$x }-> {children };
1241+ }
1242+
1243+ $node_ref -> {path } = $path ;
1244+ }
1245+
1246+ sub remove_path {
1247+ my ($paths_table , $path ) = @_ ;
1248+ my $nodes_ref ;
1249+ my $node_name ;
1250+
1251+ foreach my $x (split (' /' , $path )) {
1252+ if (!exists ($paths_table -> {$x })) {
1253+ return ;
1254+ }
1255+
1256+ $nodes_ref = $paths_table ;
1257+ $node_name = $x ;
1258+
1259+ $paths_table = $paths_table -> {$x }-> {children };
1260+ }
1261+
1262+ delete ($nodes_ref -> {$node_name });
1263+ }
1264+
1265+ sub collect_paths {
1266+ my ($paths_table , $paths_ref ) = @_ ;
1267+
1268+ foreach my $v (values %$paths_table ) {
1269+ my $p = $v -> {path };
1270+ my $c = $v -> {children };
1271+
1272+ collect_paths($c , $paths_ref );
1273+
1274+ if (defined ($p )) {
1275+ push (@$paths_ref , $p );
1276+ }
1277+ }
1278+ }
1279+
12141280 sub scan {
1215- my ($r , $empty_dirs , $line ) = @_ ;
1281+ my ($r , $paths_table , $line ) = @_ ;
12161282 if (defined $r && $line =~ / ^r(\d +)$ / ) {
12171283 return 0 if $1 > $r ;
12181284 } elsif ($line =~ / ^ \+ empty_dir: (.+)$ / ) {
1219- $empty_dirs -> { $1 } = 1 ;
1285+ add_path( $paths_table , $1 ) ;
12201286 } elsif ($line =~ / ^ \- empty_dir: (.+)$ / ) {
1221- my @d = grep {m [^\Q$1 \E(/ |$) ]} (keys %$empty_dirs );
1222- delete @$empty_dirs {@d };
1287+ remove_path($paths_table , $1 );
12231288 }
12241289 1; # continue
12251290 };
12261291
1227- my %empty_dirs = ();
1292+ my @empty_dirs ;
1293+ my %paths_table ;
1294+
12281295 my $gz_file = " $self ->{dir}/unhandled.log.gz" ;
12291296 if (-f $gz_file ) {
12301297 if (!can_compress()) {
@@ -1235,7 +1302,7 @@ sub mkemptydirs {
12351302 die " Unable to open $gz_file : $! \n " ;
12361303 my $line ;
12371304 while ($gz -> gzreadline($line ) > 0) {
1238- scan($r , \% empty_dirs , $line ) or last;
1305+ scan($r , \%paths_table , $line ) or last ;
12391306 }
12401307 $gz -> gzclose;
12411308 }
@@ -1244,13 +1311,14 @@ sub mkemptydirs {
12441311 if (open my $fh , ' <' , " $self ->{dir}/unhandled.log" ) {
12451312 binmode $fh or croak " binmode: $! " ;
12461313 while (<$fh >) {
1247- scan($r , \% empty_dirs , $_ ) or last;
1314+ scan($r , \%paths_table , $_ ) or last ;
12481315 }
12491316 close $fh ;
12501317 }
12511318
1319+ collect_paths(\%paths_table , \@empty_dirs );
12521320 my $strip = qr /\A\Q @{[$self ->path]}\E (?:\/ |$ )/ ;
1253- foreach my $d (sort keys % empty_dirs ) {
1321+ foreach my $d (sort @ empty_dirs ) {
12541322 $d = uri_decode($d );
12551323 $d =~ s / $strip// ;
12561324 next unless length ($d );
0 commit comments