Skip to content

Commit 18b7ac4

Browse files
committed
Working reasonably well at cleansing text. Note that we use 'punctuation' to delimit words, so any intra-word punctuation will break a match.
0 parents  commit 18b7ac4

File tree

6 files changed

+224
-0
lines changed

6 files changed

+224
-0
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
.idea/
2+
CMakeCache.txt
3+
CMakeFiles/
4+
cmake_install.cmake
5+
Makefile
6+
tests

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Build script for the cppure word filter and its self-test executable.
cmake_minimum_required(VERSION 3.10)
project(cppure CXX)

# The sources use C++11 features (range-for, auto); fail the configure step
# instead of silently falling back to an older dialect.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# The test program opens "wordlist.txt" relative to its working directory,
# so place a copy next to the built executable.
configure_file(wordlist.txt wordlist.txt COPYONLY)

add_executable(tests cppure.cpp tests.cpp)

# Let `ctest` run the self-test; tests run from the build directory by default.
enable_testing()
add_test(NAME tests COMMAND tests)

cppure.cpp

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
#include "cppure.h"

#include <algorithm>
#include <cctype>
#include <fstream>
#include <set>
#include <string>
#include <vector>
6+
7+
using namespace std;
8+
9+
namespace cppure
{
    // Words to mask, one entry per line of the file given to load_wordlist().
    // cleanse() only ever compares whole words, and it splits words on
    // whitespace and punctuation, so an entry that itself contains whitespace
    // or punctuation (e.g. "a b") can never match.
    std::vector<std::string> wordlist;

    // Scanner state used by cleanse().
    enum state {
        WaitForWord,    // between words: only whitespace seen since the last delimiter
        ProcessingWord  // currently accumulating the characters of a word
    };

    namespace
    {
        // The <cctype> classification functions have undefined behaviour when
        // handed a negative value other than EOF, which is what a plain `char`
        // holding a non-ASCII byte (e.g. UTF-8) can be. Always go through
        // unsigned char first.
        inline bool is_space(char c)
        {
            return std::isspace(static_cast<unsigned char>(c)) != 0;
        }

        inline bool is_punct(char c)
        {
            return std::ispunct(static_cast<unsigned char>(c)) != 0;
        }

        inline int fold_case(char c)
        {
            return std::tolower(static_cast<unsigned char>(c));
        }
    }

    // Replaces the global word list with the lines of the file at `path`.
    //
    // Blank lines are skipped and a trailing '\r' is removed so that files with
    // CRLF line endings produce usable entries.
    //
    // Returns false when the file cannot be opened; the word list has already
    // been cleared at that point and stays empty.
    bool load_wordlist(std::string const& path)
    {
        wordlist.clear();

        std::ifstream ifs(path);
        if (!ifs.good()) return false;

        std::string word;
        // Looping on getline() itself (rather than on good()) avoids pushing a
        // bogus empty entry after the final newline of the file.
        while (std::getline(ifs, word)) {
            if (!word.empty() && word.back() == '\r') word.pop_back();
            if (!word.empty()) wordlist.push_back(word);
        }

        return true;
    }

    // Returns true when `buf` equals, ignoring case, at least one of the
    // word-list entries whose indices are listed in `possible`.
    // Indices that do not refer to an existing entry are ignored.
    bool icase_match(std::string const& buf, std::set<int> const& possible)
    {
        for (int i : possible) {
            if (i < 0 || static_cast<std::size_t>(i) >= wordlist.size()) continue;

            std::string const& candidate = wordlist[static_cast<std::size_t>(i)];

            // Must be same length, otherwise it is only a prefix match at best
            if (candidate.size() != buf.size()) continue;

            bool const same = std::equal(
                buf.begin(), buf.end(), candidate.begin(),
                [](char a, char b) { return fold_case(a) == fold_case(b); });

            if (same) return true;
        }

        return false;
    }

    // Returns a copy of `str` in which every word that matches a word-list
    // entry (ignoring case) is replaced by the same number of '*' characters.
    //
    // A word is a maximal run of characters that are neither whitespace nor
    // punctuation; the delimiters themselves are copied through unchanged, so
    // the result always has the same length as the input. Because punctuation
    // delimits words, "a.s.s" is seen as three one-letter words.
    //
    // The input is walked once. While a word is being read, the set of
    // word-list entries that still have the buffered text as a prefix is
    // narrowed character by character, so a word is only compared in full
    // against entries that can still match.
    std::string cleanse(std::string const& str)
    {
        state st = WaitForWord;

        std::string ret, buf;
        ret.reserve(str.size());

        std::set<int> all_possible, possible;
        for (std::size_t i = 0; i < wordlist.size(); ++i)
            all_possible.insert(static_cast<int>(i));

        for (char c : str) {
            bool const space = is_space(c);
            bool const delimiter = space || is_punct(c);

            if (st == WaitForWord) {
                // Still waiting for start of word: whitespace passes straight through
                if (space) {
                    ret += c;
                    continue;
                }

                // Anything else starts a word (a lone punctuation character
                // starts and immediately ends an empty one)
                st = ProcessingWord;
                possible = all_possible;
            }

            // End of the current word: mask it if it matches, then emit it
            // together with the delimiter that terminated it
            if (delimiter) {
                if (!buf.empty() && !possible.empty() && icase_match(buf, possible)) {
                    std::fill(buf.begin(), buf.end(), '*');
                }

                ret += buf;
                ret += c;
                buf.clear();
                st = WaitForWord;
                continue;
            }

            buf += c;

            // Nothing can match any more; just keep buffering until the delimiter
            if (possible.empty()) continue;

            // Drop every candidate that is too short for the buffer or whose
            // character at the newest position differs (ignoring case)
            std::size_t const last = buf.size() - 1;
            for (auto it = possible.begin(); it != possible.end();) {
                std::string const& candidate = wordlist[static_cast<std::size_t>(*it)];

                if (candidate.size() < buf.size() ||
                    fold_case(candidate[last]) != fold_case(c)) {
                    it = possible.erase(it);
                } else {
                    ++it;
                }
            }
        }

        // The input may end in the middle of a word; process that last word too
        if (!buf.empty() && icase_match(buf, possible)) {
            std::fill(buf.begin(), buf.end(), '*');
        }

        ret += buf;

        return ret;
    }
}

cppure.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#ifndef CPPURE_H
#define CPPURE_H

#include <string>

// Simple word filter: masks words from a configurable word list with '*'.
namespace cppure
{
    /// Replaces the current word list with the contents of the text file at
    /// `path`, which holds one word per line.
    ///
    /// @param path  Path of the word-list file.
    /// @return true if the file could be opened; false otherwise, in which
    ///         case the word list is left empty.
    bool load_wordlist(std::string const& path);

    /// Returns a copy of `str` in which every word that equals a word-list
    /// entry, ignoring case, is replaced by '*' characters of the same length.
    ///
    /// Words are delimited by whitespace and punctuation, so punctuation
    /// inside a word (e.g. "a.s.s") prevents a match.
    ///
    /// @param str  Text to filter.
    /// @return The filtered text.
    std::string cleanse(std::string const& str);
}

#endif // CPPURE_H

tests.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#include "cppure.h"
2+
3+
#include <cassert>
4+
5+
int main(int argc, char const* argv[])
6+
{
7+
assert(cppure::load_wordlist("wordlist.txt"));
8+
assert(cppure::cleanse("ass") == "***");
9+
assert(cppure::cleanse("a.s.s") == "a.s.s");
10+
assert(cppure::cleanse("ASS") == "***");
11+
assert(cppure::cleanse("Ass") == "***");
12+
assert(cppure::cleanse("aSS") == "***");
13+
assert(cppure::cleanse(" ass") == " ***");
14+
assert(cppure::cleanse(" ass") == " ***");
15+
assert(cppure::cleanse("ass.") == "***.");
16+
assert(cppure::cleanse("ass. ") == "***. ");
17+
assert(cppure::cleanse(" ass. ") == " ***. ");
18+
assert(cppure::cleanse(".ass.") == ".***.");
19+
assert(cppure::cleanse("ass!") == "***!");
20+
assert(cppure::cleanse("you ass") == "you ***");
21+
assert(cppure::cleanse("tit") == "***");
22+
assert(cppure::cleanse("title") == "title");
23+
assert(cppure::cleanse("ass tit") == "*** ***");
24+
}

wordlist.txt

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
ass
2+
asses
3+
cock
4+
cocks
5+
cunt
6+
cunts
7+
dick
8+
dicks
9+
fag
10+
fags
11+
fuck
12+
fucks
13+
fucker
14+
fuckers
15+
gay
16+
gays
17+
homo
18+
homos
19+
nigger
20+
niggers
21+
pussy
22+
pussies
23+
shit
24+
shits
25+
tit
26+
tits
27+
twat
28+
twats
29+
whore
30+
whores

0 commit comments

Comments
 (0)