123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236 |
- function getHashtagRegexStringChars(): Readonly<{
- alpha: string
- alphanumeric: string
- leftChars: string
- rightChars: string
- }> {
- // Latin accented characters
- // Excludes 0xd7 from the range
- // (the multiplication sign, confusable with "x").
- // Also excludes 0xf7, the division sign
- const latinAccents
- = '\xC0-\xD6'
- + '\xD8-\xF6'
- + '\xF8-\xFF'
- + '\u0100-\u024F'
- + '\u0253-\u0254'
- + '\u0256-\u0257'
- + '\u0259'
- + '\u025B'
- + '\u0263'
- + '\u0268'
- + '\u026F'
- + '\u0272'
- + '\u0289'
- + '\u028B'
- + '\u02BB'
- + '\u0300-\u036F'
- + '\u1E00-\u1EFF'
- // Cyrillic (Russian, Ukrainian, etc.)
- const nonLatinChars
- = '\u0400-\u04FF' // Cyrillic
- + '\u0500-\u0527' // Cyrillic Supplement
- + '\u2DE0-\u2DFF' // Cyrillic Extended A
- + '\uA640-\uA69F' // Cyrillic Extended B
- + '\u0591-\u05BF' // Hebrew
- + '\u05C1-\u05C2'
- + '\u05C4-\u05C5'
- + '\u05C7'
- + '\u05D0-\u05EA'
- + '\u05F0-\u05F4'
- + '\uFB12-\uFB28' // Hebrew Presentation Forms
- + '\uFB2A-\uFB36'
- + '\uFB38-\uFB3C'
- + '\uFB3E'
- + '\uFB40-\uFB41'
- + '\uFB43-\uFB44'
- + '\uFB46-\uFB4F'
- + '\u0610-\u061A' // Arabic
- + '\u0620-\u065F'
- + '\u066E-\u06D3'
- + '\u06D5-\u06DC'
- + '\u06DE-\u06E8'
- + '\u06EA-\u06EF'
- + '\u06FA-\u06FC'
- + '\u06FF'
- + '\u0750-\u077F' // Arabic Supplement
- + '\u08A0' // Arabic Extended A
- + '\u08A2-\u08AC'
- + '\u08E4-\u08FE'
- + '\uFB50-\uFBB1' // Arabic Pres. Forms A
- + '\uFBD3-\uFD3D'
- + '\uFD50-\uFD8F'
- + '\uFD92-\uFDC7'
- + '\uFDF0-\uFDFB'
- + '\uFE70-\uFE74' // Arabic Pres. Forms B
- + '\uFE76-\uFEFC'
- + '\u200C-\u200C' // Zero-Width Non-Joiner
- + '\u0E01-\u0E3A' // Thai
- + '\u0E40-\u0E4E' // Hangul (Korean)
- + '\u1100-\u11FF' // Hangul Jamo
- + '\u3130-\u3185' // Hangul Compatibility Jamo
- + '\uA960-\uA97F' // Hangul Jamo Extended-A
- + '\uAC00-\uD7AF' // Hangul Syllables
- + '\uD7B0-\uD7FF' // Hangul Jamo Extended-B
- + '\uFFA1-\uFFDC' // Half-width Hangul
- const charCode = String.fromCharCode
- const cjkChars
- = '\u30A1-\u30FA\u30FC-\u30FE' // Katakana (full-width)
- + '\uFF66-\uFF9F' // Katakana (half-width)
- + '\uFF10-\uFF19\uFF21-\uFF3A'
- + '\uFF41-\uFF5A' // Latin (full-width)
- + '\u3041-\u3096\u3099-\u309E' // Hiragana
- + '\u3400-\u4DBF' // Kanji (CJK Extension A)
- + `\u4E00-\u9FFF${// Kanji (Unified)
- // Disabled as it breaks the Regex.
- // charCode(0x20000) + '-' + charCode(0x2A6DF) + // Kanji (CJK Extension B)
- charCode(0x2A700)
- }-${
- charCode(0x2B73F) // Kanji (CJK Extension C)
- }${charCode(0x2B740)
- }-${
- charCode(0x2B81F) // Kanji (CJK Extension D)
- }${charCode(0x2F800)
- }-${
- charCode(0x2FA1F)
- }\u3003\u3005\u303B` // Kanji (CJK supplement)
- const otherChars = latinAccents + nonLatinChars + cjkChars
- // equivalent of \p{L}
- const unicodeLetters
- = '\u0041-\u005A\u0061-\u007A\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6'
- + '\u00F8-\u0241\u0250-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EE\u037A\u0386'
- + '\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03F5\u03F7-\u0481'
- + '\u048A-\u04CE\u04D0-\u04F9\u0500-\u050F\u0531-\u0556\u0559\u0561-\u0587'
- + '\u05D0-\u05EA\u05F0-\u05F2\u0621-\u063A\u0640-\u064A\u066E-\u066F'
- + '\u0671-\u06D3\u06D5\u06E5-\u06E6\u06EE-\u06EF\u06FA-\u06FC\u06FF\u0710'
- + '\u0712-\u072F\u074D-\u076D\u0780-\u07A5\u07B1\u0904-\u0939\u093D\u0950'
- + '\u0958-\u0961\u097D\u0985-\u098C\u098F-\u0990\u0993-\u09A8\u09AA-\u09B0'
- + '\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC-\u09DD\u09DF-\u09E1\u09F0-\u09F1'
- + '\u0A05-\u0A0A\u0A0F-\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32-\u0A33'
- + '\u0A35-\u0A36\u0A38-\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D'
- + '\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2-\u0AB3\u0AB5-\u0AB9\u0ABD'
- + '\u0AD0\u0AE0-\u0AE1\u0B05-\u0B0C\u0B0F-\u0B10\u0B13-\u0B28\u0B2A-\u0B30'
- + '\u0B32-\u0B33\u0B35-\u0B39\u0B3D\u0B5C-\u0B5D\u0B5F-\u0B61\u0B71\u0B83'
- + '\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99-\u0B9A\u0B9C\u0B9E-\u0B9F'
- + '\u0BA3-\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0C05-\u0C0C\u0C0E-\u0C10'
- + '\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C60-\u0C61\u0C85-\u0C8C'
- + '\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE'
- + '\u0CE0-\u0CE1\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39'
- + '\u0D60-\u0D61\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6'
- + '\u0E01-\u0E30\u0E32-\u0E33\u0E40-\u0E46\u0E81-\u0E82\u0E84\u0E87-\u0E88'
- + '\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7'
- + '\u0EAA-\u0EAB\u0EAD-\u0EB0\u0EB2-\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6'
- + '\u0EDC-\u0EDD\u0F00\u0F40-\u0F47\u0F49-\u0F6A\u0F88-\u0F8B\u1000-\u1021'
- + '\u1023-\u1027\u1029-\u102A\u1050-\u1055\u10A0-\u10C5\u10D0-\u10FA\u10FC'
- + '\u1100-\u1159\u115F-\u11A2\u11A8-\u11F9\u1200-\u1248\u124A-\u124D'
- + '\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0'
- + '\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310'
- + '\u1312-\u1315\u1318-\u135A\u1380-\u138F\u13A0-\u13F4\u1401-\u166C'
- + '\u166F-\u1676\u1681-\u169A\u16A0-\u16EA\u1700-\u170C\u170E-\u1711'
- + '\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7'
- + '\u17DC\u1820-\u1877\u1880-\u18A8\u1900-\u191C\u1950-\u196D\u1970-\u1974'
- + '\u1980-\u19A9\u19C1-\u19C7\u1A00-\u1A16\u1D00-\u1DBF\u1E00-\u1E9B'
- + '\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D'
- + '\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC'
- + '\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC'
- + '\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2090-\u2094\u2102\u2107'
- + '\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D'
- + '\u212F-\u2131\u2133-\u2139\u213C-\u213F\u2145-\u2149\u2C00-\u2C2E'
- + '\u2C30-\u2C5E\u2C80-\u2CE4\u2D00-\u2D25\u2D30-\u2D65\u2D6F\u2D80-\u2D96'
- + '\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6'
- + '\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u3005-\u3006\u3031-\u3035'
- + '\u303B-\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF'
- + '\u3105-\u312C\u3131-\u318E\u31A0-\u31B7\u31F0-\u31FF\u3400-\u4DB5'
- + '\u4E00-\u9FBB\uA000-\uA48C\uA800-\uA801\uA803-\uA805\uA807-\uA80A'
- + '\uA80C-\uA822\uAC00-\uD7A3\uF900-\uFA2D\uFA30-\uFA6A\uFA70-\uFAD9'
- + '\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C'
- + '\uFB3E\uFB40-\uFB41\uFB43-\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F'
- + '\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF21-\uFF3A'
- + '\uFF41-\uFF5A\uFF66-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7'
- + '\uFFDA-\uFFDC'
- // equivalent of \p{Mn}\p{Mc}
- const unicodeAccents
- = '\u0300-\u036F\u0483-\u0486\u0591-\u05B9\u05BB-\u05BD\u05BF'
- + '\u05C1-\u05C2\u05C4-\u05C5\u05C7\u0610-\u0615\u064B-\u065E\u0670'
- + '\u06D6-\u06DC\u06DF-\u06E4\u06E7-\u06E8\u06EA-\u06ED\u0711\u0730-\u074A'
- + '\u07A6-\u07B0\u0901-\u0903\u093C\u093E-\u094D\u0951-\u0954\u0962-\u0963'
- + '\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7-\u09C8\u09CB-\u09CD\u09D7'
- + '\u09E2-\u09E3\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47-\u0A48\u0A4B-\u0A4D'
- + '\u0A70-\u0A71\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD'
- + '\u0AE2-\u0AE3\u0B01-\u0B03\u0B3C\u0B3E-\u0B43\u0B47-\u0B48\u0B4B-\u0B4D'
- + '\u0B56-\u0B57\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7'
- + '\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56'
- + '\u0C82-\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5-\u0CD6'
- + '\u0D02-\u0D03\u0D3E-\u0D43\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D82-\u0D83'
- + '\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2-\u0DF3\u0E31\u0E34-\u0E3A'
- + '\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB-\u0EBC\u0EC8-\u0ECD\u0F18-\u0F19'
- + '\u0F35\u0F37\u0F39\u0F3E-\u0F3F\u0F71-\u0F84\u0F86-\u0F87\u0F90-\u0F97'
- + '\u0F99-\u0FBC\u0FC6\u102C-\u1032\u1036-\u1039\u1056-\u1059\u135F'
- + '\u1712-\u1714\u1732-\u1734\u1752-\u1753\u1772-\u1773\u17B6-\u17D3\u17DD'
- + '\u180B-\u180D\u18A9\u1920-\u192B\u1930-\u193B\u19B0-\u19C0\u19C8-\u19C9'
- + '\u1A17-\u1A1B\u1DC0-\u1DC3\u20D0-\u20DC\u20E1\u20E5-\u20EB\u302A-\u302F'
- + '\u3099-\u309A\uA802\uA806\uA80B\uA823-\uA827\uFB1E\uFE00-\uFE0F'
- + '\uFE20-\uFE23'
- // equivalent of \p{Dn}
- const unicodeDigits
- = '\u0030-\u0039\u0660-\u0669\u06F0-\u06F9\u0966-\u096F\u09E6-\u09EF'
- + '\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE6-\u0BEF\u0C66-\u0C6F'
- + '\u0CE6-\u0CEF\u0D66-\u0D6F\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F29'
- + '\u1040-\u1049\u17E0-\u17E9\u1810-\u1819\u1946-\u194F\u19D0-\u19D9'
- + '\uFF10-\uFF19'
- // An alpha char is a unicode chars including unicode marks or
- // letter or char in otherChars range
- const alpha = unicodeLetters
- // A numeric character is any with the number digit property, or
- // underscore. These characters can be included in hashtags, but a hashtag
- // cannot have only these characters.
- const numeric = `${unicodeDigits}_`
- // Alphanumeric char is any alpha char or a unicode char with decimal
- // number property \p{Nd}
- const alphanumeric = alpha + numeric
- const leftChars = '{'
- const rightChars = '}'
- return {
- alpha,
- alphanumeric,
- leftChars,
- rightChars,
- }
- }
- export function getHashtagRegexString(): string {
- const { alpha, alphanumeric, leftChars, rightChars } = getHashtagRegexStringChars()
- const hashtagAlpha = `[${alpha}]`
- const hashtagAlphanumeric = `[${alphanumeric}]`
- const hashLeftCharList = `[${leftChars}]`
- const hashRightCharList = `[${rightChars}]`
- // A hashtag contains characters, numbers and underscores,
- // but not all numbers.
- const hashtag
- = `(${
- hashLeftCharList
- })`
- + `(${
- hashLeftCharList
- })([a-zA-Z_][a-zA-Z0-9_]{0,29}`
- + `)(${
- hashRightCharList
- })(${
- hashRightCharList
- })`
- return hashtag
- }
|