Skip to content

Commit ec5a039

Browse files
committed
*(XWiki)UserLinkAdditionFinder: filter out government domains.
*Arabic Wikipedia main page was moved.
1 parent 1d78394 commit ec5a039

File tree

4 files changed

+70
-31
lines changed

4 files changed

+70
-31
lines changed

src/org/wikipedia/tools/UserLinkAdditionFinder.java

Lines changed: 53 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/**
2-
* @(#)UserLinkAdditionFinder.java 0.02 05/11/2017
3-
* Copyright (C) 2015-2017 MER-C
2+
* @(#)UserLinkAdditionFinder.java 0.03 15/06/2024
3+
* Copyright (C) 2015-2024 MER-C
44
*
55
* This program is free software; you can redistribute it and/or
66
* modify it under the terms of the GNU General Public License
@@ -29,12 +29,15 @@
2929
/**
3030
* Finds links added by a user in the main namespace.
3131
* @author MER-C
32-
* @version 0.02
32+
* @version 0.03
3333
*/
3434
public class UserLinkAdditionFinder
3535
{
3636
private final WMFWiki wiki;
37+
private final Pages pages;
38+
private final ExternalLinks el;
3739
private static final WMFWikiFarm sessions = WMFWikiFarm.instance();
40+
private final List<Pattern> whitelist_regexes = new ArrayList<>();
3841

3942
/**
4043
* Runs this program.
@@ -81,40 +84,19 @@ public static void main(String[] args) throws IOException
8184
System.out.println("No links found.");
8285
System.exit(0);
8386
}
84-
8587
Map<String, String> linkdomains = new HashMap<>();
8688
for (Map.Entry<Wiki.Revision, List<String>> entry : results.entrySet())
8789
{
8890
for (String link : entry.getValue())
8991
{
9092
String domain = ExternalLinks.extractDomain(link);
91-
if (domain != null) // must be parseable
92-
{
93-
boolean nomatch = true;
94-
for (String wmfsite : WMFWikiFarm.WMF_DOMAINS)
95-
if (domain.endsWith(wmfsite))
96-
nomatch = false;
97-
if (nomatch)
98-
linkdomains.put(link, domain);
99-
}
100-
}
101-
}
102-
103-
// remove blacklisted links
104-
Collection<String> domains = new TreeSet(linkdomains.values());
105-
ExternalLinks el = ExternalLinks.of(thiswiki);
106-
if (removeblacklisted)
107-
{
108-
Iterator<String> iter = domains.iterator();
109-
while (iter.hasNext())
110-
{
111-
String link = iter.next();
112-
if (el.isSpamBlacklisted(linkdomains.get(link)))
113-
iter.remove();
93+
if (domain != null && !finder.canSkipDomain(domain, removeblacklisted))
94+
linkdomains.put(link, domain);
11495
}
11596
}
116-
97+
11798
// remove commonly used domains
99+
Collection<String> domains = new TreeSet(linkdomains.values());
118100
Map<String, Integer> linkcounts = null;
119101
if (linksearch)
120102
{
@@ -149,6 +131,21 @@ public static void main(String[] args) throws IOException
149131
public UserLinkAdditionFinder(WMFWiki wiki)
150132
{
151133
this.wiki = wiki;
134+
this.pages = Pages.of(wiki);
135+
this.el = ExternalLinks.of(wiki);
136+
137+
String[] regex = new String[]
138+
{
139+
".*\\.(?:gov|int|mil)$",
140+
"(.*\\.)?gov\\.(?:au|br|cn|ie|in|il|ph|ru|scot|sg|ua|uk|wales|za)$",
141+
"(.*\\.)?gob\\.(?:ar|cl|es|mx|pe)$",
142+
"(.*\\.)?(?:bl|judiciary|mod|nhs|parliament|police|royal)\\.uk$",
143+
"(.*\\.)?\\bgouv\\.fr",
144+
"(.*\\.)?\\bgovt\\.nz$",
145+
"(.*\\.)?\\beuropa\\.eu$"
146+
};
147+
for (String r : regex)
148+
whitelist_regexes.add(Pattern.compile(r));
152149
}
153150

154151
/**
@@ -191,6 +188,33 @@ public Map<Wiki.Revision, List<String>> getLinksAdded(List<String> users, Offset
191188
return results;
192189
}
193190

191+
/**
192+
* Filters added links for inclusion in search results.
193+
*
194+
* @param domain the domain to check
195+
* @param removeblacklisted remove already blacklisted links
196+
* @return whether this domain can be skipped for the purpose of this
197+
* search
198+
* @throws IOException if a network error occurs when fetching the spam
199+
* blacklist (highly unlikely)
200+
* @since 0.03
201+
*/
202+
public boolean canSkipDomain(String domain, boolean removeblacklisted) throws IOException
203+
{
204+
// WMF domains
205+
for (String wmfsite : WMFWikiFarm.WMF_DOMAINS)
206+
if (domain.endsWith(wmfsite))
207+
return true;
208+
// government domains
209+
for (Pattern p : whitelist_regexes)
210+
if (p.matcher(domain).matches())
211+
return true;
212+
// blacklisted domains
213+
if (removeblacklisted && el.isSpamBlacklisted(domain))
214+
return true;
215+
return false;
216+
}
217+
194218
/**
195219
* For a map that contains revision data &#8594; links added in that
196220
* revision, check whether the links still exist in the current version of
@@ -216,7 +240,7 @@ public Map<String, Map<String, Boolean>> checkIfLinksAreStillPresent(Map<Wiki.Re
216240
}
217241
list.addAll(listoflinks);
218242
});
219-
return Pages.of(wiki).containExternalLinks(resultsbypage);
243+
return pages.containExternalLinks(resultsbypage);
220244
}
221245

222246
public String outputWikitableResults(Map<Wiki.Revision, List<String>> data,

src/org/wikipedia/tools/XWikiUserLinkAdditionFinder.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ public static void main(String[] args) throws Exception
7878
for (String link : entry.getValue())
7979
{
8080
String domain = ExternalLinks.extractDomain(link);
81-
if (domain != null) // must be parseable
81+
if (domain != null && !finder.canSkipDomain(domain, false)) // must be parseable
8282
linkdomains.put(link, domain);
8383
}
8484
}

test/org/wikipedia/WikiTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ public void resolveRedirects() throws Exception
369369
List<String> expected = List.of("Main Page", "Main Page", "Sdkghsdklg",
370370
"Goatse.cx", "Main Page", "Fish and chips", "Fish and chips");
371371
assertEquals(expected, enWiki.resolveRedirects(titles));
372-
assertEquals(List.of("الصفحة الرئيسية"), arWiki.resolveRedirects(List.of("الصفحه الرئيسيه")), "rtl");
372+
assertEquals(List.of("الصفحة الرئيسة"), arWiki.resolveRedirects(List.of("الصفحه الرئيسيه")), "rtl");
373373
}
374374

375375
@Test

test/org/wikipedia/tools/UserLinkAdditionFinderTest.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,21 @@ public void getWiki() throws Exception
5454
assertEquals("test.wikipedia.org", finder_test.getWiki().getDomain());
5555
assertEquals("en.wikipedia.org", finder_en.getWiki().getDomain());
5656
}
57+
58+
@Test
59+
public void canSkipDomain() throws Exception
60+
{
61+
List<String> domains = List.of("en.wikipedia.org", "www.fda.gov", "nasa.gov",
62+
"army.mil", "www.consilium.europa.eu", "gov.uk", "wa.gov.au", "europa.eu",
63+
"govt.nz", "www.govt.nz", "bl.uk", "parliament.uk", "un.int");
64+
for (String domain : domains)
65+
assertTrue(finder_en.canSkipDomain(domain, true), "domain: " + domain);
66+
67+
domains = List.of("www.example.com", "blah.gov.invalid", "fakegov.uk", "blahbl.uk",
68+
"fake-gov.uk");
69+
for (String domain : domains)
70+
assertFalse(finder_en.canSkipDomain(domain, true), "domain: " + domain);
71+
}
5772

5873
@Test
5974
public void getLinksAdded() throws Exception

0 commit comments

Comments
 (0)