Working reasonably well at cleansing text. Note that we use 'punctuation' to delimit words, so any intra-word punctuation will break a match.

aardvarkk · aardvarkk · commit 18b7ac4ef6c5 · 2016-10-18T13:43:24.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,6 @@
+.idea/
+CMakeCache.txt
+CMakeFiles/
+cmake_install.cmake
+Makefile
+tests
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,4 @@
+# CMAKE_CXX_STANDARD needs CMake >= 3.1; 3.5 is the oldest version that
+# current CMake releases still accept without deprecation errors.
+cmake_minimum_required(VERSION 3.5)
+project(cppure CXX)
+
+# The sources rely on C++11 features (range-for, auto), so require the
+# standard instead of letting CMake silently fall back to an older one.
+set (CMAKE_CXX_STANDARD 11)
+set (CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Copy the wordlist into the build directory so the test binary can open it
+# by the relative path "wordlist.txt".
+configure_file(wordlist.txt wordlist.txt COPYONLY)
+add_executable(tests cppure.cpp tests.cpp)
diff --git a/cppure.cpp b/cppure.cpp
@@ -0,0 +1,153 @@
+#include "cppure.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <fstream>
+#include <set>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+namespace cppure
+{
+	// Words to censor, one entry per wordlist line. Matching is whole-word and
+	// case-insensitive. Whitespace and punctuation delimit words in cleanse(),
+	// so an entry that itself contains either (e.g. "a b") can never match.
+	std::vector<std::string> wordlist;
+
+	// Scanner state used by cleanse().
+	enum state {
+		WaitForWord,    // between words: whitespace is copied straight through
+		ProcessingWord  // accumulating the characters of the current word
+	};
+
+	// The <cctype> functions have undefined behaviour for negative arguments
+	// (other than EOF), which is what a plain char holding a non-ASCII byte
+	// can be. These wrappers route every call through unsigned char.
+	static int lower(char c) { return std::tolower(static_cast<unsigned char>(c)); }
+	static bool is_space(char c) { return std::isspace(static_cast<unsigned char>(c)) != 0; }
+	static bool is_punct(char c) { return std::ispunct(static_cast<unsigned char>(c)) != 0; }
+
+	// Replaces the current wordlist with the contents of the file at `path`
+	// (one word per line). Blank lines are skipped and a trailing '\r' from
+	// CRLF line endings is stripped.
+	// Returns false (leaving the wordlist empty) if the file cannot be opened.
+	bool load_wordlist(std::string const& path)
+	{
+		wordlist.clear();
+
+		std::ifstream ifs(path);
+		if (!ifs.good()) return false;
+
+		std::string word;
+		// Loop on the result of getline itself: testing good() before the read
+		// appends a bogus empty entry once the final newline has been consumed.
+		while (std::getline(ifs, word)) {
+			if (!word.empty() && word.back() == '\r') word.pop_back();
+			if (!word.empty()) wordlist.push_back(word);
+		}
+
+		return true;
+	}
+
+	// Returns true if `buf` equals, ignoring case, any wordlist entry whose
+	// index is in `possible`.
+	bool icase_match(std::string const& buf, std::set<int> const& possible) {
+		for (int i : possible) {
+			std::string const& candidate = wordlist[i];
+
+			// Must be same length, which also makes the 3-iterator equal safe
+			if (candidate.size() != buf.size()) continue;
+
+			bool const same = std::equal(buf.begin(), buf.end(), candidate.begin(),
+				[](char a, char b) { return lower(a) == lower(b); });
+			if (same) return true;
+		}
+
+		return false;
+	}
+
+	// Returns a copy of `str` in which every whole word that matches a
+	// wordlist entry (case-insensitively) has each character replaced by '*'.
+	// Words are delimited by whitespace and punctuation; delimiters are
+	// copied to the output unchanged.
+	std::string cleanse(std::string const& str)
+	{
+		state st = WaitForWord;
+
+		std::string ret, buf;
+		ret.reserve(str.size());
+
+		// Indices of all wordlist entries; copied into `possible` at the start
+		// of each word and narrowed as characters arrive.
+		std::set<int> all_possible, possible;
+		for (std::size_t i = 0; i < wordlist.size(); ++i)
+			all_possible.insert(all_possible.end(), static_cast<int>(i));
+
+		// Walk once through input string
+		for (char c : str) {
+			bool const iss = is_space(c);
+			bool const delimiter = iss || is_punct(c);
+
+			if (st == WaitForWord) {
+				// Still waiting for start of word...
+				if (iss) {
+					ret += c;
+					continue;
+				}
+
+				// Starting a word (a punctuation character also lands here and
+				// is flushed immediately as a delimiter below)
+				st = ProcessingWord;
+				possible = all_possible;
+			}
+
+			// No candidates remain: the word cannot match, so just collect it
+			// and flush at the next delimiter.
+			if (possible.empty()) {
+				buf += c;
+
+				if (delimiter) {
+					ret += buf;
+					buf.clear();
+					st = WaitForWord;
+				}
+
+				continue;
+			}
+
+			// End of the current word: mask it if it is a full match
+			if (delimiter) {
+				if (icase_match(buf, possible)) {
+					std::fill(buf.begin(), buf.end(), '*');
+				}
+
+				buf += c;
+				ret += buf;
+				buf.clear();
+				st = WaitForWord;
+				continue;
+			}
+
+			// Ordinary word character: append it, then drop every candidate
+			// that is now too short or differs at this position.
+			buf += c;
+			std::size_t const last = buf.size() - 1;
+
+			for (auto it = possible.begin(); it != possible.end();) {
+				std::string const& candidate = wordlist[*it];
+
+				if (candidate.size() < buf.size() || lower(candidate[last]) != lower(c)) {
+					it = possible.erase(it);
+				} else {
+					++it;
+				}
+			}
+		}
+
+		// Process the last word (input ended without a trailing delimiter)
+		if (icase_match(buf, possible)) {
+			std::fill(buf.begin(), buf.end(), '*');
+		}
+
+		ret += buf;
+
+		return ret;
+	}
+}
diff --git a/cppure.h b/cppure.h
@@ -0,0 +1,7 @@
+#ifndef CPPURE_H
+#define CPPURE_H
+
+#include <string>
+
+namespace cppure
+{
+	// Replaces the active wordlist with the lines of the file at `path`
+	// (one word per line). Returns false if the file cannot be opened.
+	bool load_wordlist(std::string const& path);
+
+	// Returns a copy of `str` in which each whole word matching a wordlist
+	// entry (ignoring case) is replaced by '*' characters of the same length.
+	// Whitespace and punctuation delimit words and are kept unchanged.
+	std::string cleanse(std::string const& str);
+}
+
+#endif // CPPURE_H
diff --git a/tests.cpp b/tests.cpp
@@ -0,0 +1,24 @@
+#include "cppure.h"
+
+#include <cassert>
+
+// Smoke tests for cppure::cleanse against the bundled wordlist.txt.
+// Returns 1 if the wordlist cannot be loaded; a failed expectation aborts
+// via assert. NOTE: the expectations themselves are compiled out when
+// NDEBUG is defined, so build this target without it.
+int main(int argc, char const* argv[])
+{
+	(void)argc;
+	(void)argv;
+
+	// Load outside of assert(): with NDEBUG the assert expression is never
+	// evaluated, which would skip loading the wordlist altogether.
+	if (!cppure::load_wordlist("wordlist.txt")) return 1;
+
+	// Case-insensitive whole-word matches
+	assert(cppure::cleanse("ass") == "***");
+	assert(cppure::cleanse("a.s.s") == "a.s.s");
+	assert(cppure::cleanse("ASS") == "***");
+	assert(cppure::cleanse("Ass") == "***");
+	assert(cppure::cleanse("aSS") == "***");
+
+	// Surrounding whitespace and punctuation are preserved
+	assert(cppure::cleanse(" ass") == " ***");
+	assert(cppure::cleanse("  ass") == "  ***");
+	assert(cppure::cleanse("ass.") == "***.");
+	assert(cppure::cleanse("ass. ") == "***. ");
+	assert(cppure::cleanse(" ass. ") == " ***. ");
+	assert(cppure::cleanse(".ass.") == ".***.");
+	assert(cppure::cleanse("ass!") == "***!");
+	assert(cppure::cleanse("you ass") == "you ***");
+
+	// A listed word that is only a prefix of a longer word must not match
+	assert(cppure::cleanse("tit") == "***");
+	assert(cppure::cleanse("title") == "title");
+	assert(cppure::cleanse("ass tit") == "*** ***");
+
+	return 0;
+}
diff --git a/wordlist.txt b/wordlist.txt
@@ -0,0 +1,30 @@
+ass
+asses
+cock
+cocks
+cunt
+cunts
+dick
+dicks
+fag
+fags
+fuck
+fucks
+fucker
+fuckers
+gay
+gays
+homo
+homos
+nigger
+niggers
+pussy
+pussies
+shit
+shits
+tit
+tits
+twat
+twats
+whore
+whores

-Original file line number
+Diff line change
@@ @@ -0,0 +1,30 @@ @@
 +ass
 +asses
 +cock
 +cocks
 +cunt
 +cunts
 +dick
 +dicks
 +fag
 +fags
 +fuck
 +fucks
 +fucker
 +fuckers
 +gay
 +gays
 +homo
 +homos
 +nigger
 +niggers
 +pussy
 +pussies
 +shit
 +shits
 +tit
 +tits
 +twat
 +twats
 +whore
 +whores