.NET, Java, Perl, PHP and Python 3(?) all support PCRE's \p{L}
regex, which matches any Unicode character that represents a letter, but there is no such shortcut in JavaScript (as far as I know)... I'm working on a library focused on string manipulation, and I badly need the equivalent for JavaScript. So far I've got the 1172-character-long regex below, built in a rather clunky way. I would appreciate it if someone could confirm or deny whether I got it right, or better, show how to make it more general and accurate. Here's how I built it, top to bottom:
//
// JavaScript synonym for (.NET/Java/Perl/PCRE)'s `\p{L}` regexp.
// Build the inclusive list of single-character strings running from the
// first UTF-16 code unit of `a` to the first UTF-16 code unit of `z`.
//
// a, z: strings; only their first code unit is used for the bounds.
// Returns an array of one-character strings, or an empty array when the
// comparison `a <= z` does not hold.
function crange (a, z) {
  var chars = [];
  // guard: nothing to produce when the bounds are not in ascending order
  if (!(a <= z)) {
    return chars;
  }
  var first = a.charCodeAt(0);
  var last = z.charCodeAt(0);
  // a NaN bound (empty string) makes the comparison false, so the loop is skipped
  for (var code = first; code <= last; code += 1) {
    chars.push(String.fromCharCode(code));
  }
  return chars;
}
// Maps a list to a 2D list of inclusive integer ranges.
//
// Only elements passing the 32-bit integer test `(n | 0) === n` are
// considered; everything else is ignored. The integers are sorted
// numerically, duplicates are folded, and each run of successive values
// becomes one entry:
//   - a lone value `v`      -> [v]
//   - a run from `a` to `b` -> [a, b]
// e.g. [1, 2, 3, 5, 7, 8].intranges() -> [[1, 3], [5], [7, 8]]
//
// The receiver is not modified (`filter` returns a fresh array, which is
// the one that gets sorted).
//
// Defined as a non-enumerable property so that `for...in` loops over
// arrays elsewhere do not pick up `intranges` as a key.
Object.defineProperty(Array.prototype, 'intranges', {
  configurable: true,
  writable: true,
  enumerable: false,
  value: function () {
    // keep 32-bit integers only, in ascending numeric order
    var sorted = this.filter(function (n) {
      return (n | 0) === n;
    }).sort(function (n1, n2) {
      return n1 - n2;
    });
    var len = sorted.length;
    var res = [];
    var i = 0;
    while (i < len) {
      var first = sorted[i];
      var last = first;
      // extend the run while the next value is a duplicate or the successor;
      // on exit `i` is the index of the first value outside the run
      while (++i < len && (sorted[i] === last || sorted[i] === last + 1)) {
        last = sorted[i];
      }
      res.push(first === last ? [first] : [first, last]);
    }
    return res;
  }
});
// String of escaped characters / character ranges (e.g. `\x41-\x5A`),
// presumably meant to be wrapped in `[...]` when handed to `RegExp`.
//
// It lists every UTF-16 code unit in U+0000..U+FFFF (inclusive) whose
// upper-case and lower-case forms differ.
// NOTE(review): "has distinct upper/lower case" is used as a stand-in for
// "is a letter"; characters without case variants are never selected by
// this test, and any cased non-letter is. Confirm this is acceptable.
var letter_regex = (function () {
  var codePoints = [];

  // crange starts at U+0000, so the array index equals the code point
  crange('\u0000', '\uffff').forEach(function (ch, cp) {
    if (ch.toUpperCase() !== ch.toLowerCase()) {
      codePoints.push(cp);
    }
  });

  // collapse successive code points into [start] / [start, end] entries
  var ranges = codePoints.intranges();

  // render each bound as \xHH (up to 0xff) or \uHHHH, bounds joined by '-'
  var pieces = ranges.map(function (bounds) {
    var escaped = bounds.map(function (cp) {
      var hex = cp.toString(16);
      if (cp <= 0xff) {
        return '\\x' + pad02(hex);
      }
      return '\\u' + pad04(hex);
    });
    return escaped.join('-');
  });

  return pieces.join('');
}());
//
//
// and it generated this (10ft) long string:
//
// letter_regex = '\x41-\x5A\x61-\x7A\xB5\xC0-\xD6\xD8-\xDE\xE0-\xF6\xF8-\u0137\u0139-\u0148\u014A-\u018C\u018E-\u019A\u019C-\u01A9\u01AC-\u01B9\u01BC-\u01BD\u01BF\u01C4-\u01EF\u01F1-\u0220\u0222-\u0233\u023A-\u0254\u0256-\u0257\u0259\u025B\u0260\u0263\u0265-\u0266\u0268-\u0269\u026B\u026F\u0271-\u0272\u0275\u027D\u0280\u0283\u0288-\u028C\u0292\u0345\u0370-\u0373\u0376-\u0377\u037B-\u037D\u0386\u0388-\u038A\u038C\u038E-\u038F\u0391-\u03A1\u03A3-\u03AF\u03B1-\u03D1\u03D5-\u03F2\u03F4-\u03F5\u03F7-\u03FB\u03FD-\u0481\u048A-\u0527\u0531-\u0556\u0561-\u0586\u10A0-\u10C5\u10C7\u10CD\u1D79\u1D7D\u1E00-\u1E95\u1E9B\u1E9E\u1EA0-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F51\u1F53\u1F55\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB1\u1FB3\u1FB8-\u1FBC\u1FBE\u1FC3\u1FC8-\u1FCC\u1FD0-\u1FD1\u1FD8-\u1FDB\u1FE0-\u1FE1\u1FE5\u1FE8-\u1FEC\u1FF3\u1FF8-\u1FFC\u2126\u212A-\u212B\u2132\u214E\u2160-\u217F\u2183-\u2184\u24B6-\u24E9\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2C70\u2C72-\u2C73\u2C75-\u2C76\u2C7E-\u2CE3\u2CEB-\u2CEE\u2CF2-\u2CF3\u2D00-\u2D25\u2D27\u2D2D\uA640-\uA66D\uA680-\uA697\uA722-\uA72F\uA732-\uA76F\uA779-\uA787\uA78B-\uA78D\uA790-\uA793\uA7A0-\uA7AA\uFF21-\uFF3A\uFF41-\uFF5A';
//
//
// Left-pad the hex string `c` with zeros to a width of two characters and
// upper-case it. Strings already two or more characters long are only
// upper-cased, never truncated.
function pad02 (c) {
  // '00'.slice(len) yields exactly the zeros still missing: '00', '0' or ''
  var zeros = '00'.slice(c.length);
  var padded = zeros + c;
  return padded.toUpperCase();
}
// Left-pad the hex string `c` with zeros to a width of four characters and
// upper-case it. Strings already four or more characters long are only
// upper-cased, never truncated.
function pad04 (c) {
  // '0000'.slice(len) yields exactly the zeros still missing
  var zeros = '0000'.slice(c.length);
  var padded = zeros + c;
  return padded.toUpperCase();
}
// `Array.prototype.filter` callback: accepts an element only if it passes
// `_isint` and its position is not after the first position at which the
// same value occurs in the array (i.e. later duplicates are rejected).
//
// node: current element; pos: its index; self: the array being filtered.
function _intranges (node, pos, self) {
  if (!_isint(node)) {
    return false;
  }
  var firstSeenAt = self.indexOf(node);
  return firstSeenAt >= pos;
}
// True when `n` is strictly equal to its own 32-bit integer coercion
// (`n | 0`); anything that changes under that coercion, including
// non-numbers, yields false.
function _isint (n) {
  var coerced = n | 0;
  return coerced === n;
}
// Comparator for `Array.prototype.sort` giving ascending numeric order:
// negative when n1 < n2, positive when n1 > n2, zero when they are equal.
function _nsort (n1, n2) {
  var difference = n1 - n2;
  return difference;
}
// /eof
I think I have the regex equivalent for
\p{L}
; I used the BabelMap app to generate it. It covers 48k+ letter characters in the {Ll, Lm, Lo, Lt, Lu}
sets. I've posted the version that includes code points
> 0xffff
and the characters it matches here (it's too much text to dump in a single SO post). The JavaScript solution below catches the most common cases: