Skip to content

Commit 3587996

Browse files
committed
Add regexp to match an unpaired UTF-16 surrogate
1 parent 6ca1b83 commit 3587996

File tree

7 files changed

+289
-0
lines changed

7 files changed

+289
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# UTF-16 Unpaired Surrogate
2+
3+
> [Regular expression][mdn-regexp] to match an unpaired [UTF-16][utf-16] surrogate.
4+
5+
6+
<section class="usage">
7+
8+
## Usage
9+
10+
``` javascript
11+
var RE_UTF16_UNPAIRED_SURROGATE = require( '@stdlib/regexp/utf16-unpaired-surrogate' );
12+
```
13+
14+
#### RE_UTF16_UNPAIRED_SURROGATE
15+
16+
[Regular expression][mdn-regexp] to match an unpaired [UTF-16][utf-16] surrogate.
17+
18+
``` javascript
19+
var bool = RE_UTF16_UNPAIRED_SURROGATE.test( 'abc\uD800def' );
20+
// returns true
21+
```
22+
23+
</section>
24+
25+
<!-- /.usage -->
26+
27+
28+
<section class="examples">
29+
30+
## Examples
31+
32+
``` javascript
33+
var RE_UTF16_UNPAIRED_SURROGATE = require( '@stdlib/regexp/utf16-unpaired-surrogate' );
34+
35+
var bool = RE_UTF16_UNPAIRED_SURROGATE.test( '\uD800' );
36+
// returns true
37+
38+
bool = RE_UTF16_UNPAIRED_SURROGATE.test( '\uDC00' );
39+
// returns true
40+
41+
bool = RE_UTF16_UNPAIRED_SURROGATE.test( 'abc' );
42+
// returns false
43+
```
44+
45+
</section>
46+
47+
<!-- /.examples -->
48+
49+
50+
<section class="links">
51+
52+
[mdn-regexp]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
53+
[utf-16]: https://en.wikipedia.org/wiki/UTF-16
54+
55+
</section>
56+
57+
<!-- /.links -->
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
'use strict';
2+
3+
// MODULES //
4+
5+
var bench = require( '@stdlib/bench' );
6+
var isBoolean = require( '@stdlib/assert/is-boolean' ).isPrimitive;
7+
var fromCodePoint = require( '@stdlib/string/from-code-point' );
8+
var pkg = require( './../package.json' ).name;
9+
var RE_UTF16_UNPAIRED_SURROGATE = require( './../lib' ); // eslint-disable-line id-length
10+
11+
12+
// MAIN //
13+
14+
bench( pkg, function benchmark( b ) {
15+
var bool;
16+
var str;
17+
var i;
18+
19+
b.tic();
20+
for ( i = 0; i < b.iterations; i++ ) {
21+
str = 'beep boop\r\n'+fromCodePoint( 97 + (i%26) )+'\r\nfoo bar';
22+
bool = RE_UTF16_UNPAIRED_SURROGATE.test( str );
23+
if ( !isBoolean( bool ) ) {
24+
b.fail( 'should return a boolean' );
25+
}
26+
}
27+
b.toc();
28+
if ( !isBoolean( bool ) ) {
29+
b.fail( 'should return a boolean' );
30+
}
31+
b.pass( 'benchmark finished' );
32+
b.end();
33+
});
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
2+
{{alias}}
3+
Regular expression to match an unpaired UTF-16 surrogate.
4+
5+
Examples
6+
--------
7+
> var bool = {{alias}}.test( 'abc' )
8+
false
9+
> bool = {{alias}}.test( '\uD800' )
10+
true
11+
12+
See Also
13+
--------
14+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
'use strict';
2+
3+
var RE_UTF16_UNPAIRED_SURROGATE = require( './../lib' ); // eslint-disable-line id-length
4+
5+
console.log( RE_UTF16_UNPAIRED_SURROGATE.test( '\uD800' ) );
6+
// => true
7+
8+
console.log( RE_UTF16_UNPAIRED_SURROGATE.test( '\uDC00' ) );
9+
// => true
10+
11+
console.log( RE_UTF16_UNPAIRED_SURROGATE.test( 'abc' ) );
12+
// => false
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
'use strict';
2+
3+
/**
4+
* Regular expression to match an unpaired UTF-16 surrogate.
5+
*
6+
* @module @stdlib/regexp/utf16-unpaired-surrogate
7+
* @type {RegExp}
8+
*
9+
* @example
10+
* var RE_UTF16_UNPAIRED_SURROGATE = require( '@stdlib/regexp/utf16-unpaired-surrogate' );
11+
*
12+
* var bool = RE_UTF16_UNPAIRED_SURROGATE.test( '\uD800' );
13+
* // returns true
14+
*
15+
* bool = RE_UTF16_UNPAIRED_SURROGATE.test( '\uDC00' );
16+
* // returns true
17+
*
18+
* bool = RE_UTF16_UNPAIRED_SURROGATE.test( 'abc' );
19+
* // returns false
20+
*/
21+
22+
23+
// MAIN //
24+
25+
/**
26+
* Matches an unpaired UTF-16 surrogate.
27+
*
28+
* Regular expression: `/(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/`
29+
*
30+
* * `(?:[^\uD800-\uDBFF]|^)`
31+
* - match, without capturing, either a code unit which is not a high surrogate or the beginning of the input
32+
* * `[\uDC00-\uDFFF]`
33+
* - match a low surrogate
34+
* * `|`
35+
* - OR
36+
* * `[\uD800-\uDBFF]`
37+
* - match a high surrogate
38+
* * `(?![\uDC00-\uDFFF])`
39+
* - but only accept the previous match if not followed by a low surrogate
40+
*
41+
*
42+
* @constant
43+
* @type {RegExp}
44+
* @default /(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/
45+
*/
46+
var RE_UTF16_UNPAIRED_SURROGATE = /(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/; // eslint-disable-line id-length
47+
48+
49+
// EXPORTS //
50+
51+
module.exports = RE_UTF16_UNPAIRED_SURROGATE;
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
{
2+
"name": "@stdlib/regexp/utf16-unpaired-surrogate",
3+
"version": "0.0.0",
4+
"description": "Regular expression to match an unpaired UTF-16 surrogate.",
5+
"author": {
6+
"name": "The Stdlib Authors",
7+
"url": "https://github.com/stdlib-js/stdlib/graphs/contributors"
8+
},
9+
"contributors": [
10+
{
11+
"name": "The Stdlib Authors",
12+
"url": "https://github.com/stdlib-js/stdlib/graphs/contributors"
13+
}
14+
],
15+
"scripts": {},
16+
"main": "./lib",
17+
"repository": {
18+
"type": "git",
19+
"url": "git://github.com/stdlib-js/stdlib.git"
20+
},
21+
"homepage": "https://github.com/stdlib-js/stdlib",
22+
"keywords": [
23+
"stdlib",
24+
"regex",
25+
"regexp",
26+
"re",
27+
"utf-16",
28+
"utf16",
29+
"unicode",
30+
"surrogate",
31+
"high",
32+
"low",
33+
"pair",
34+
"regular",
35+
"expression",
36+
"capture",
37+
"match"
38+
],
39+
"bugs": {
40+
"url": "https://github.com/stdlib-js/stdlib/issues"
41+
},
42+
"dependencies": {},
43+
"devDependencies": {},
44+
"engines": {
45+
"node": ">=0.10.0",
46+
"npm": ">2.7.0"
47+
},
48+
"license": "Apache-2.0"
49+
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
'use strict';
2+
3+
// MODULES //
4+
5+
var tape = require( 'tape' );
6+
var RE = require( './../lib' );
7+
8+
9+
// TESTS //
10+
11+
tape( 'main export is a regular expression', function test( t ) {
12+
t.ok( true, __filename );
13+
t.strictEqual( RE instanceof RegExp, true, 'main export is a regular expression' );
14+
t.end();
15+
});
16+
17+
tape( 'the regular expression matches an unpaired UTF-16 surrogate', function test( t ) {
18+
var values;
19+
var i;
20+
21+
values = [
22+
'\uD800',
23+
'\uD801',
24+
'\uD802',
25+
'\uDBFF',
26+
'\uDC00',
27+
'\uDC01',
28+
'\uDFFE',
29+
'\uDFFF',
30+
'abc\uD800abc',
31+
'abc\uDFFFabc'
32+
];
33+
34+
for ( i = 0; i < values.length; i++ ) {
35+
t.strictEqual( RE.test( values[ i ] ), true, 'matches when provided '+values[i] );
36+
}
37+
t.end();
38+
});
39+
40+
tape( 'the regular expression does not match surrogate pairs', function test( t ) {
41+
var values;
42+
var i;
43+
44+
values = [
45+
'\uD800\uDC00',
46+
'\uD801\uDC01',
47+
'\uDBFF\uDFFF',
48+
'abc\uD800\uDC00abc'
49+
];
50+
51+
for ( i = 0; i < values.length; i++ ) {
52+
t.strictEqual( RE.test( values[ i ] ), false, 'does not match when provided '+values[i] );
53+
}
54+
t.end();
55+
});
56+
57+
tape( 'the regular expression does not match non-surrogates', function test( t ) {
58+
var values;
59+
var i;
60+
61+
values = [
62+
'a',
63+
'b',
64+
'c',
65+
'abc',
66+
'defgihjk'
67+
];
68+
69+
for ( i = 0; i < values.length; i++ ) {
70+
t.strictEqual( RE.test( values[ i ] ), false, 'does not match when provided '+values[i] );
71+
}
72+
t.end();
73+
});

0 commit comments

Comments
 (0)