forked from RooCodeInc/Roo-Code
-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathtext-normalization.ts
More file actions
95 lines (85 loc) · 2.44 KB
/
text-normalization.ts
File metadata and controls
95 lines (85 loc) · 2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/**
 * Lookup tables used by string normalization.
 *
 * Each table maps a single "fancy" Unicode character (the key) to the plain
 * ASCII text that should replace it (the value).
 */
export const NORMALIZATION_MAPS = {
	/** Curly ("smart") quotation marks mapped to their straight equivalents. */
	SMART_QUOTES: {
		"\u201C": '"', // U+201C left double quotation mark
		"\u201D": '"', // U+201D right double quotation mark
		"\u2018": "'", // U+2018 left single quotation mark
		"\u2019": "'", // U+2019 right single quotation mark
	},
	/** Other typographic characters mapped to plain ASCII text. */
	TYPOGRAPHIC: {
		"\u2026": "...", // U+2026 horizontal ellipsis
		"\u2014": "-", // U+2014 em dash
		"\u2013": "-", // U+2013 en dash
		"\u00A0": " ", // U+00A0 non-breaking space
	},
}
/**
 * Flags selecting which normalization steps are applied to a string.
 * Every flag is optional.
 */
export interface NormalizeOptions {
	/** Replace smart (curly) quotes with straight quotes. */
	smartQuotes?: boolean
	/** Replace typographic characters (ellipsis, dashes, non-breaking space). */
	typographicChars?: boolean
	/** Collapse each run of whitespace into a single space. */
	extraWhitespace?: boolean
	/** Trim whitespace from the start and end of the string. */
	trim?: boolean
}
/**
 * Baseline normalization settings: every step is switched on.
 */
const DEFAULT_OPTIONS: NormalizeOptions = {
	smartQuotes: true, // straighten curly quotes
	typographicChars: true, // replace ellipsis, dashes, non-breaking space
	extraWhitespace: true, // collapse whitespace runs
	trim: true, // strip leading/trailing whitespace
}
/**
 * Normalizes a string based on the specified options.
 *
 * Steps run in a fixed order: smart quotes, typographic characters,
 * whitespace collapsing, then trimming. Any flag missing from `options`
 * takes its value from DEFAULT_OPTIONS.
 *
 * @param str The string to normalize
 * @param options Normalization options
 * @returns The normalized string
 */
export function normalizeString(str: string, options: NormalizeOptions = DEFAULT_OPTIONS): string {
	const opts = { ...DEFAULT_OPTIONS, ...options }

	// Replace every occurrence of each key in `table` with its value.
	// split/join treats both sides as literal text, so a key containing regex
	// metacharacters or a value containing "$" patterns cannot change the
	// result, and no RegExp has to be compiled on each call.
	const applyTable = (input: string, table: Record<string, string>): string => {
		let output = input
		for (const [from, to] of Object.entries(table)) {
			output = output.split(from).join(to)
		}
		return output
	}

	let normalized = str

	// Replace smart quotes
	if (opts.smartQuotes) {
		normalized = applyTable(normalized, NORMALIZATION_MAPS.SMART_QUOTES)
	}

	// Replace typographic characters
	if (opts.typographicChars) {
		normalized = applyTable(normalized, NORMALIZATION_MAPS.TYPOGRAPHIC)
	}

	// Collapse each whitespace run (including tabs and newlines) to one space
	if (opts.extraWhitespace) {
		normalized = normalized.replace(/\s+/g, " ")
	}

	// Trim whitespace from both ends
	if (opts.trim) {
		normalized = normalized.trim()
	}

	return normalized
}
/**
 * Unescapes common HTML entities in a string.
 *
 * Handles `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;` and `&amp;`. Other
 * entities are left untouched. Empty input is returned as-is.
 *
 * @param text The string containing HTML entities to unescape
 * @returns The unescaped string with HTML entities converted to their literal characters
 */
export function unescapeHtmlEntities(text: string): string {
	if (!text) return text

	return (
		text
			.replace(/&lt;/g, "<")
			.replace(/&gt;/g, ">")
			.replace(/&quot;/g, '"')
			.replace(/&#39;/g, "'")
			.replace(/&apos;/g, "'")
			// "&amp;" must be handled last: doing it earlier would turn
			// "&amp;lt;" into "&lt;" and then into "<" (double unescaping).
			.replace(/&amp;/g, "&")
	)
}