Skip to content

Commit fe1ec51

Browse files
taserz and claude committed
Add Smart Quotes canonicizer
Closes #112.

Adds a canonicizer that replaces Unicode smart/curly quotes with plain ASCII equivalents. Word processors automatically substitute typographic quotes for straight ones, so the same phrase can tokenize differently depending on where the text came from. This normalizes that before feature extraction.

Characters handled:
- U+2018, U+2019, U+201B (curly single quotes, high-reversed-9) -> '
- U+201C, U+201D, U+201E, U+201F (curly double quotes, low/high-9) -> "
- U+00AB, U+00BB (double angle quotation marks) -> "
- U+2039, U+203A (single angle quotation marks) -> '
- U+2032, U+2033 (prime, double prime) -> ' and " respectively

Shows up in the GUI. Unit tests included.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent f6757d1 commit fe1ec51

2 files changed

Lines changed: 122 additions & 0 deletions

File tree

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* JGAAP -- a graphical program for stylometric authorship attribution
3+
* Copyright (C) 2009,2011 by Patrick Juola
4+
*
5+
* This program is free software: you can redistribute it and/or modify
6+
* it under the terms of the GNU Affero General Public License as
7+
* published by the Free Software Foundation, either version 3 of the
8+
* License, or (at your option) any later version.
9+
*
10+
* This program is distributed in the hope that it will be useful,
11+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
* GNU Affero General Public License for more details.
14+
*
15+
* You should have received a copy of the GNU Affero General Public License
16+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
*/
18+
package com.jgaap.canonicizers;
19+
20+
import com.jgaap.generics.Canonicizer;
21+
22+
/**
23+
* Replaces Unicode smart/curly quotes with plain ASCII equivalents so that
24+
* typographic and straight quotes are treated identically during feature
25+
* extraction.
26+
*/
27+
public class SmartQuotes extends Canonicizer {
28+
29+
@Override
30+
public String displayName() {
31+
return "Smart Quotes";
32+
}
33+
34+
@Override
35+
public String tooltipText() {
36+
return "Replace Unicode smart/curly quotes with plain ASCII quote characters.";
37+
}
38+
39+
@Override
40+
public boolean showInGUI() {
41+
return true;
42+
}
43+
44+
@Override
45+
public char[] process(char[] procText) {
46+
StringBuilder sb = new StringBuilder(procText.length);
47+
for (char c : procText) {
48+
switch (c) {
49+
case '‘': // ' left single quotation mark
50+
case '’': // ' right single quotation mark
51+
case '‛': // ‛ single high-reversed-9 quotation mark
52+
case '′': // ′ prime
53+
sb.append('\'');
54+
break;
55+
case '“': // " left double quotation mark
56+
case '”': // " right double quotation mark
57+
case '„': // „ double low-9 quotation mark
58+
case '‟': // ‟ double high-reversed-9 quotation mark
59+
case '″': // ″ double prime
60+
case '«': // « left-pointing double angle quotation mark
61+
case '»': // » right-pointing double angle quotation mark
62+
sb.append('"');
63+
break;
64+
case '‹': // ‹ single left-pointing angle quotation mark
65+
case '›': // › single right-pointing angle quotation mark
66+
sb.append('\'');
67+
break;
68+
default:
69+
sb.append(c);
70+
}
71+
}
72+
return sb.toString().toCharArray();
73+
}
74+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package com.jgaap.canonicizers;
2+
3+
import static org.junit.Assert.*;
4+
5+
import java.util.Arrays;
6+
7+
import org.junit.Test;
8+
9+
public class SmartQuotesTest {
10+
11+
@Test
12+
public void testProcess() {
13+
SmartQuotes canon = new SmartQuotes();
14+
15+
// Curly single quotes → '
16+
assertTrue(Arrays.equals(new char[]{'\''},
17+
canon.process(new char[]{'‘'})));
18+
assertTrue(Arrays.equals(new char[]{'\''},
19+
canon.process(new char[]{'’'})));
20+
21+
// Curly double quotes → "
22+
assertTrue(Arrays.equals(new char[]{'"'},
23+
canon.process(new char[]{'“'})));
24+
assertTrue(Arrays.equals(new char[]{'"'},
25+
canon.process(new char[]{'”'})));
26+
27+
// Double low-9 quotation mark → "
28+
assertTrue(Arrays.equals(new char[]{'"'},
29+
canon.process(new char[]{'„'})));
30+
31+
// Angle quotation marks → "
32+
assertTrue(Arrays.equals(new char[]{'"'},
33+
canon.process(new char[]{'«'})));
34+
assertTrue(Arrays.equals(new char[]{'"'},
35+
canon.process(new char[]{'»'})));
36+
37+
// Plain ASCII characters pass through unchanged
38+
String plain = "Hello, \"world\"! It's fine.";
39+
assertTrue(Arrays.equals(plain.toCharArray(),
40+
canon.process(plain.toCharArray())));
41+
42+
// Mixed smart and plain text
43+
String input = "“Hello” ‘world’";
44+
String expected = "\"Hello\" 'world'";
45+
assertTrue(Arrays.equals(expected.toCharArray(),
46+
canon.process(input.toCharArray())));
47+
}
48+
}

0 commit comments

Comments
 (0)