11/**
2- * @(#)UserLinkAdditionFinder.java 0.02 05/11/2017
3- * Copyright (C) 2015-2017 MER-C
2+ * @(#)UserLinkAdditionFinder.java 0.03 15/06/2024
3+ * Copyright (C) 2015-2024 MER-C
44 *
55 * This program is free software; you can redistribute it and/or
66 * modify it under the terms of the GNU General Public License
2929/**
3030 * Finds links added by a user in the main namespace.
3131 * @author MER-C
32- * @version 0.02
32+ * @version 0.03
3333 */
3434public class UserLinkAdditionFinder
3535{
3636 private final WMFWiki wiki ;
37+ private final Pages pages ;
38+ private final ExternalLinks el ;
3739 private static final WMFWikiFarm sessions = WMFWikiFarm .instance ();
40+ private final List <Pattern > whitelist_regexes = new ArrayList <>();
3841
3942 /**
4043 * Runs this program.
@@ -81,40 +84,19 @@ public static void main(String[] args) throws IOException
8184 System .out .println ("No links found." );
8285 System .exit (0 );
8386 }
84-
8587 Map <String , String > linkdomains = new HashMap <>();
8688 for (Map .Entry <Wiki .Revision , List <String >> entry : results .entrySet ())
8789 {
8890 for (String link : entry .getValue ())
8991 {
9092 String domain = ExternalLinks .extractDomain (link );
91- if (domain != null ) // must be parseable
92- {
93- boolean nomatch = true ;
94- for (String wmfsite : WMFWikiFarm .WMF_DOMAINS )
95- if (domain .endsWith (wmfsite ))
96- nomatch = false ;
97- if (nomatch )
98- linkdomains .put (link , domain );
99- }
100- }
101- }
102-
103- // remove blacklisted links
104- Collection <String > domains = new TreeSet (linkdomains .values ());
105- ExternalLinks el = ExternalLinks .of (thiswiki );
106- if (removeblacklisted )
107- {
108- Iterator <String > iter = domains .iterator ();
109- while (iter .hasNext ())
110- {
111- String link = iter .next ();
112- if (el .isSpamBlacklisted (linkdomains .get (link )))
113- iter .remove ();
93+ if (domain != null && !finder .canSkipDomain (domain , removeblacklisted ))
94+ linkdomains .put (link , domain );
11495 }
11596 }
116-
97+
11798 // remove commonly used domains
99+ Collection <String > domains = new TreeSet (linkdomains .values ());
118100 Map <String , Integer > linkcounts = null ;
119101 if (linksearch )
120102 {
@@ -149,6 +131,21 @@ public static void main(String[] args) throws IOException
149131 public UserLinkAdditionFinder (WMFWiki wiki )
150132 {
151133 this .wiki = wiki ;
134+ this .pages = Pages .of (wiki );
135+ this .el = ExternalLinks .of (wiki );
136+
137+ String [] regex = new String []
138+ {
139+ ".*\\ .(?:gov|int|mil)$" ,
140+ "(.*\\ .)?gov\\ .(?:au|br|cn|ie|in|il|ph|ru|scot|sg|ua|uk|wales|za)$" ,
141+ "(.*\\ .)?gob\\ .(?:ar|cl|es|mx|pe)$" ,
142+ "(.*\\ .)?(?:bl|judiciary|mod|nhs|parliament|police|royal)\\ .uk$" ,
143+ "(.*\\ .)?\\ bgouv\\ .fr" ,
144+ "(.*\\ .)?\\ bgovt\\ .nz$" ,
145+ "(.*\\ .)?\\ beuropa\\ .eu$"
146+ };
147+ for (String r : regex )
148+ whitelist_regexes .add (Pattern .compile (r ));
152149 }
153150
154151 /**
@@ -191,6 +188,33 @@ public Map<Wiki.Revision, List<String>> getLinksAdded(List<String> users, Offset
191188 return results ;
192189 }
193190
191+ /**
192+ * Filters added links for inclusion in search results.
193+ *
194+ * @param domain the domain to check
195+ * @param removeblacklisted remove already blacklisted links
196+ * @return whether this domain can be skipped for the purpose of this
197+ * search
198+ * @throws IOException if a network error occurs when fetching the spam
199+ * blacklist (highly unlikely)
200+ * @since 0.03
201+ */
202+ public boolean canSkipDomain (String domain , boolean removeblacklisted ) throws IOException
203+ {
204+ // WMF domains
205+ for (String wmfsite : WMFWikiFarm .WMF_DOMAINS )
206+ if (domain .endsWith (wmfsite ))
207+ return true ;
208+ // government domains
209+ for (Pattern p : whitelist_regexes )
210+ if (p .matcher (domain ).matches ())
211+ return true ;
212+ // blacklisted domains
213+ if (removeblacklisted && el .isSpamBlacklisted (domain ))
214+ return true ;
215+ return false ;
216+ }
217+
194218 /**
195219 * For a map that contains revision data → links added in that
196220 * revision, check whether the links still exist in the current version of
@@ -216,7 +240,7 @@ public Map<String, Map<String, Boolean>> checkIfLinksAreStillPresent(Map<Wiki.Re
216240 }
217241 list .addAll (listoflinks );
218242 });
219- return Pages . of ( wiki ) .containExternalLinks (resultsbypage );
243+ return pages .containExternalLinks (resultsbypage );
220244 }
221245
222246 public String outputWikitableResults (Map <Wiki .Revision , List <String >> data ,
0 commit comments