-
Notifications
You must be signed in to change notification settings - Fork 354
Expand file tree
/
Copy pathMatch.cs
More file actions
798 lines (715 loc) · 44.1 KB
/
Match.cs
File metadata and controls
798 lines (715 loc) · 44.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.PowerFx.Core.App.ErrorContainers;
using Microsoft.PowerFx.Core.Binding;
using Microsoft.PowerFx.Core.Errors;
using Microsoft.PowerFx.Core.Functions;
using Microsoft.PowerFx.Core.Localization;
using Microsoft.PowerFx.Core.Types;
using Microsoft.PowerFx.Core.Types.Enums;
using Microsoft.PowerFx.Core.Utils;
using Microsoft.PowerFx.Syntax;
#pragma warning disable SA1402 // File may only contain a single type
#pragma warning disable SA1649 // File name should match first type name
namespace Microsoft.PowerFx.Core.Texl.Builtins
{
// IsMatch(text:s, regular_expression:s, [options:s])
// Boolean flavor of the match functions. It hands a null RegexTypeCache to the base class, so no
// return-type caching is configured; the return type stays DType.Boolean and no record/table type is built.
internal class IsMatchFunction : BaseMatchFunction
{
    public IsMatchFunction()
        : base("IsMatch", TexlStrings.AboutIsMatch, DType.Boolean, null)
    {
    }

    // Two signatures: (text, pattern) and (text, pattern, options).
    public override IEnumerable<TexlStrings.StringGetter[]> GetSignatures()
    {
        TexlStrings.StringGetter[] textAndPattern = { TexlStrings.IsMatchArg1, TexlStrings.IsMatchArg2 };
        yield return textAndPattern;

        TexlStrings.StringGetter[] textPatternAndOptions = { TexlStrings.IsMatchArg1, TexlStrings.IsMatchArg2, TexlStrings.IsMatchArg3 };
        yield return textPatternAndOptions;
    }
}
// Match(text:s, regular_expression:s, [options:s])
// Record-returning flavor of the match functions. The declared return type is DType.EmptyRecord;
// BaseMatchFunction.CheckTypes replaces it with a record type built from the constant regular expression.
internal class MatchFunction : BaseMatchFunction
{
// regexTypeCache: cache of previously computed return types, forwarded to the base class.
public MatchFunction(RegexTypeCache regexTypeCache)
: base("Match", TexlStrings.AboutMatch, DType.EmptyRecord, regexTypeCache)
{
}
}
// MatchAll(text:s, regular_expression:s, [options:s])
// Table-returning flavor of the match functions. The declared return type is DType.EmptyTable;
// BaseMatchFunction.CheckTypes replaces it with a table type built from the constant regular expression.
internal class MatchAllFunction : BaseMatchFunction
{
// regexTypeCache: cache of previously computed return types, forwarded to the base class.
public MatchAllFunction(RegexTypeCache regexTypeCache)
: base("MatchAll", TexlStrings.AboutMatchAll, DType.EmptyTable, regexTypeCache)
{
}
}
internal class BaseMatchFunction : BuiltinFunction
{
// Cache of computed return types, keyed by a string derived from _cachePrefix and the regular expression
// (see TryCreateReturnType). Null when the constructor received no RegexTypeCache.
// Tuple contents: Item1 = return type; Item2/Item3/Item4 = whether a named capture hides the predefined
// FullMatch / SubMatches / StartMatch column. A null tuple marks a pattern that .NET could not compile.
private readonly ConcurrentDictionary<string, Tuple<DType, bool, bool, bool>> _regexTypeCache;
// "tbl_" for table-returning functions, "rec_" otherwise; only set when a cache was supplied.
private readonly string _cachePrefix;
// Entry count at which TryCreateReturnType clears the cache; only set when a cache was supplied.
private readonly int _regexCacheSize;
public override bool IsSelfContained => true;
public override bool SupportsParamCoercion => true;
public override bool UseParentScopeForArgumentSuggestions => true;
// Common constructor for IsMatch, Match and MatchAll.
//   functionName   - name passed through to the base function
//   aboutGetter    - description string getter passed through to the base function
//   returnType     - declared return type; also selects the cache key prefix
//   regexTypeCache - optional return-type cache; when null the cache fields keep their defaults
public BaseMatchFunction(string functionName, TexlStrings.StringGetter aboutGetter, DType returnType, RegexTypeCache regexTypeCache)
    : base(functionName, aboutGetter, FunctionCategories.Text, returnType, 0, 2, 3, DType.String, BuiltInEnums.MatchEnum.FormulaType._type, BuiltInEnums.MatchOptionsEnum.FormulaType._type)
{
    if (regexTypeCache == null)
    {
        // No caching requested: _regexTypeCache stays null, which the rest of the class checks for.
        return;
    }

    if (returnType.IsTable)
    {
        _cachePrefix = "tbl_";
    }
    else
    {
        _cachePrefix = "rec_";
    }

    _regexTypeCache = regexTypeCache.Cache;
    _regexCacheSize = regexTypeCache.CacheSize;
}
// Two signatures: (text, pattern) and (text, pattern, options).
public override IEnumerable<TexlStrings.StringGetter[]> GetSignatures()
{
    TexlStrings.StringGetter[] textAndPattern = { TexlStrings.MatchArg1, TexlStrings.MatchArg2 };
    yield return textAndPattern;

    TexlStrings.StringGetter[] textPatternAndOptions = { TexlStrings.MatchArg1, TexlStrings.MatchArg2, TexlStrings.MatchArg3 };
    yield return textPatternAndOptions;
}
// Names of the enums this function family needs: the Match enum and the MatchOptions enum.
public override IEnumerable<string> GetRequiredEnumNames()
{
    var requiredEnums = new List<string>();
    requiredEnums.Add(LanguageConstants.MatchEnumString);
    requiredEnums.Add(LanguageConstants.MatchOptionsEnumString);
    return requiredEnums;
}
// Suggestions are offered for the three parameter positions 0, 1 and 2.
public override bool HasSuggestionsForParam(int index)
{
    Contracts.Assert(index >= 0);

    const int parameterCount = 3;
    return index < parameterCount;
}
// Type checks a call to IsMatch/Match/MatchAll.
// Beyond the base checks, the regular expression (and, under Power Fx V1 rules, the options) must be a
// constant string or option set value so that it can be analyzed here. Under V1 rules the pattern is
// validated by IsSupportedRegularExpression; for record/table results the return type is then derived
// from the pattern by TryCreateReturnType.
// Returns false, with an error recorded, when any of these steps fails.
public override bool CheckTypes(CheckTypesContext context, TexlNode[] args, DType[] argTypes, IErrorContainer errors, out DType returnType, out Dictionary<TexlNode, DType> nodeToCoercedTypeMap)
{
    Contracts.AssertValue(args);
    Contracts.AssertAllValues(args);
    Contracts.AssertValue(argTypes);
    Contracts.Assert(args.Length == argTypes.Length);
    Contracts.Assert(args.Length == 2 || args.Length == 3);
    Contracts.AssertValue(errors);

    bool baseIsValid = base.CheckTypes(context, args, argTypes, errors, out returnType, out nodeToCoercedTypeMap);
    Contracts.Assert(returnType.IsRecord || returnType.IsTable || returnType == DType.Boolean);

    // The pattern has to be known at compile time.
    TexlNode patternNode = args[1];
    DKind patternKind = argTypes[1].Kind;
    bool patternIsTextual = patternKind == DKind.String || patternKind == DKind.OptionSetValue;
    string pattern = null;
    if (!patternIsTextual || !BinderUtils.TryGetConstantValue(context, patternNode, out pattern))
    {
        errors.EnsureError(patternNode, TexlStrings.ErrVariableRegEx);
        return false;
    }

    bool v1Rules = context.Features.PowerFxV1CompatibilityRules;
    string options = string.Empty;
    if (v1Rules)
    {
        // Under V1 rules the options, when present, have to be known at compile time as well.
        if (args.Length == 3)
        {
            DKind optionsKind = argTypes[2].Kind;
            bool optionsAreTextual = optionsKind == DKind.String || optionsKind == DKind.OptionSetValue;
            if (!optionsAreTextual || !BinderUtils.TryGetConstantValue(context, args[2], out options))
            {
                errors.EnsureError(args[2], TexlStrings.ErrVariableRegExOptions);
                return false;
            }
        }
    }
    else
    {
        // only used for the following analysis and type creation, not modified in the IR
        options += "N";
    }

    if (!baseIsValid)
    {
        return false;
    }

    string alteredOptions = options;
    if (v1Rules && !IsSupportedRegularExpression(patternNode, pattern, options, out alteredOptions, errors))
    {
        return false;
    }

    if (returnType == DType.Boolean)
    {
        // IsMatch: nothing to derive from the pattern.
        return true;
    }

    return TryCreateReturnType(patternNode, pattern, alteredOptions, errors, ref returnType);
}
// Unicode general category names accepted inside \p{...} and \P{...}.
// IsSupportedRegularExpression reports ErrInvalidRegExBadUnicodeCategory for any name not listed here.
private static readonly IReadOnlyCollection<string> UnicodeCategories = new HashSet<string>()
{
"L", "Lu", "Ll", "Lt", "Lm", "Lo",
"M", "Mn", "Mc", "Me",
"N", "Nd", "Nl", "No",
"P", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
"S", "Sm", "Sc", "Sk", "So",
"Z", "Zs", "Zl", "Zp",
"Cc", "Cf",
// "C", "Cs", "Co", "Cn", are left out for now until we have a good scenario, as they differ between implementations
};
// Power Fx regular expressions are limited to features that can be transpiled to native .NET (C# Interpreter), ECMAScript (Canvas), or PCRE2 (Excel).
// We want the same results everywhere for Power Fx, even if the underlying implementation is different. Even with these limits in place there are some minor semantic differences but we get as close as we can.
// These tests can be run through all three engines and the results compared by setting ExpressionEvaluationTests.RegExCompareEnabled; a PCRE2 DLL and NodeJS must be installed on the system.
//
// In short, we use the intersection of canonical .NET regular expressions and ECMAScript 2024's "v" flag for escaping rules.
// Someday when "v" is more widely available, we can support more of its features such as set subtraction.
// We chose to use canonical .NET instead of RegexOptions.ECMAScript because we wanted the unicode definitions for words. See https://learn.microsoft.com/dotnet/standard/base-types/regular-expression-options#ecmascript-matching-behavior
//
// In addition, Power Fx regular expressions are opinionated and try to eliminate some of the ambiguity in the common regular expression language:
// Numbered capture groups are disabled by default, and cannot be mixed with named capture groups.
// Octal character codes are not supported, use \x or \u instead.
// Literal ^, -, [, ], {, and } must be escaped when used in a character class.
// Escaping is only supported for special characters and unknown alphanumeric escape sequences are not supported.
// Unicode characters are used throughout.
// Newlines support Windows friendly \r\n as well as \r and \n.
//
// Features that are supported:
// Literal characters. Any character except the special characters [ ] \ ^ $ . | ? * + ( ) can be inserted directly.
// Escaped special characters. \ (backslash) followed by a special character to insert it directly, includes \- when in a character class.
// Operators
// Dot (.), matches everything except [\r\n] unless MatchOptions.DotAll is used.
// Anchors, ^ and $, matches the beginning and end of the string, or of a line if MatchOptions.Multiline is used.
// Quantifiers
// Greedy quantifiers. ? matches 0 or 1 times, + matches 1 or more times, * matches 0 or more times, {3} matches exactly 3 times, {1,} matches at least 1 time, {1,3} matches between 1 and 3 times. By default, matching is "greedy" and the match will be as large as possible.
// Lazy quantifiers. Same as the greedy quantifiers followed by ?, for example *? or {1,3}?. With the lazy modifier, the match will be as small as possible.
// Alternation. a|b matches "a" or "b".
// Character classes
// Custom character class. [abc] list of characters, [a-fA-F0-9] range of characters, [^a-z] everything but these characters. Character classes cannot be nested, subtracted, or intersected, and the same special character cannot be repeated in the character class.
// Word characters and breaks. \w, \W, \b, \B, using the Unicode definition of letters [\p{Ll}\p{Lu}\p{Lt}\p{Lo}\p{Nd}\p{Pc}\p{Lm}].
// Digit characters. \d includes the digits 0-9 and \p{Nd}, \D matches everything except characters matched by \d.
// Space characters. \s includes spacing characters [ \r\n\t\f\x0B\x85\p{Z}], \S which matches everything except characters matched by \s, \r carriage return, \n newline, \t tab, \f form feed.
// Control characters. \cA, where the control character is [A-Za-z].
// Hexadecimal and Unicode character codes. \x20 with two hexadecimal digits, \u2028 with four hexadecimal digits.
// Unicode character class and property. \p{Ll} matches all Unicode lowercase letters, while \P{Ll} matches everything that is not a Unicode lowercase letter.
// Capture groups
// Non capture group. (?:a), group without capturing the result as a named or numbered sub-match.
// Named group and back reference. (?<name>chars) captures a sub-match with the name name, referenced with \k<name>. Cannot be used if MatchOptions.NumberedSubMatches is enabled.
// Numbered group and back reference. (a|b) captures a sub-match, referenced with \1. MatchOptions.NumberedSubMatches must be enabled.
// Lookahead and lookbehind. (?=a), (?!a), (?<=b), (?<!b).
// Free spacing mode. Whitespace within the regular expression is ignored and # starts an end of line comment.
// Inline comments. (?# comment here), which is ignored as a comment. See MatchOptions.FreeSpacing for an alternative to formatting and commenting regular expressions.
// Inline mode modifiers. (?im) is the same as using MatchOptions.IgnoreCase and MatchOptions.Multiline. Must be used at the beginning of the regular expression. Supported inline modes are [imsx], corresponding to MatchOptions.IgnoreCase, MatchOptions.Multiline, MatchOptions.DotAll, and MatchOptions.FreeSpacing, respectively.
//
// Significant features that are not supported:
// Capture groups
// Numbered capture groups are disabled by default, use named captures or MatchOptions.NumberedSubMatches
// Self-referencing groups, such as "(a\1)"
// Single quoted named capture groups "(?'name'..." and "\k'name'"
// Balancing capture groups
// Recursion
// Character classes
// \W, \D, \P, \S are not supported inside character classes if the character class is negated (starts with [^...])
// Use of ^, -, [, or ] without an escape inside a character class is not supported
// Character class set operations, such as subtraction or intersection
// Empty character classes
// Inline options
// Turning options on or off
// Changing options later in the expression
// Setting options for a subexpression
// Conditionals
// Octal characters
// \x{...} and \u{...} notation
// Subroutines
// Possessive quantifiers
//
// In addition, the Power Fx compiler uses the .NET regular expression engine to validate the expression and determine capture group names.
// So, any regular expression that does not compile with .NET is also automatically disallowed.
// Checks a constant regular expression against the subset of the regular expression language that Power Fx supports.
// The pattern is tokenized in one linear scan with GeneralRE; the contents of each character class are tokenized
// again with CharacterClassRE. The first unsupported construct found is reported on regExNode and ends the scan.
//   regExNode      - node that errors are reported against
//   regexPattern   - the constant regular expression text
//   regexOptions   - option characters; only "x" (free spacing) and "N" (numbered sub-matches) are inspected here
//   alteredOptions - on success, "x" and/or "N" after taking inline mode modifiers into account;
//                    on failure, the unmodified regexOptions
//   errors         - receives the error for the first unsupported construct
// Returns true when no unsupported construct was found, false otherwise.
private bool IsSupportedRegularExpression(TexlNode regExNode, string regexPattern, string regexOptions, out string alteredOptions, IErrorContainer errors)
{
    bool freeSpacing = regexOptions.Contains("x"); // can also be set with inline mode modifier
    bool numberedCapture = regexOptions.Contains("N"); // can only be set here, no inline mode modifier

    alteredOptions = regexOptions;

    int captureNumber = 0; // last numbered capture encountered
    var captureStack = new Stack<string>(); // stack of all open capture groups, including null for non capturing groups, for detecting if a named group is closed
    var captureNames = new HashSet<string>(); // set of seen named groups, does not include numbered groups or non capture groups

    bool openPoundComment = false; // there is an open end-of-line pound comment, only in freeFormMode
    bool openInlineComment = false; // there is an open inline comment

    foreach (Match token in GeneralRE.Matches(regexPattern))
    {
        // Reports errKey on regExNode. The message argument is the offending token text, or, when context is
        // requested, the (up to) 8 characters of the pattern that end with the offending token.
        void RegExError(ErrorResourceKey errKey, Match errToken = null, bool context = false)
        {
            if (errToken == null)
            {
                errToken = token;
            }

            if (context)
            {
                const int contextLength = 8;
                var tokenEnd = errToken.Index + errToken.Length;
                var found = tokenEnd >= contextLength ? "..." + regexPattern.Substring(tokenEnd - contextLength, contextLength) : regexPattern.Substring(0, tokenEnd);
                errors.EnsureError(regExNode, errKey, found);
            }
            else
            {
                errors.EnsureError(regExNode, errKey, errToken.Value);
            }
        }

        if (token.Groups["newline"].Success)
        {
            openPoundComment = false;
        }
        else if (openInlineComment && (token.Groups["closeParen"].Success || token.Groups["goodEscape"].Value == "\\)"))
        {
            openInlineComment = false;
        }
        else if (!openPoundComment && !openInlineComment)
        {
            if (token.Groups["goodEscape"].Success || token.Groups["goodQuantifiers"].Success || token.Groups["goodExact"].Success || token.Groups["goodLimited"].Success || token.Groups["goodEscapeOutsideCC"].Success || token.Groups["goodEscapeOutsideAndInsideCCIfPositive"].Success)
            {
                // all is well, nothing to do
            }
            else if (token.Groups["characterClass"].Success)
            {
                bool characterClassNegative = token.Groups["characterClass"].Value[0] == '^';
                string ccString = characterClassNegative ? token.Groups["characterClass"].Value.Substring(1) : token.Groups["characterClass"].Value;

                foreach (Match ccToken in CharacterClassRE.Matches(ccString))
                {
                    void CCRegExError(ErrorResourceKey errKey)
                    {
                        RegExError(errKey, errToken: ccToken);
                    }

                    if (ccToken.Groups["goodEscape"].Success || ccToken.Groups["goodEscapeInsideCCOnly"].Success)
                    {
                        // all good, nothing to do
                    }
                    else if (ccToken.Groups["goodEscapeOutsideAndInsideCCIfPositive"].Success)
                    {
                        if (characterClassNegative)
                        {
                            CCRegExError(TexlStrings.ErrInvalidRegExBadEscapeInsideNegativeCharacterClass);
                            return false;
                        }
                    }
                    else if (ccToken.Groups["goodUEscape"].Success)
                    {
                        if (ccToken.Groups["goodUEscape"].Value == "P" && characterClassNegative)
                        {
                            // would be problematic for us to allow this if we wanted to implement MatchOptions.LocaleAware in the future
                            CCRegExError(TexlStrings.ErrInvalidRegExBadEscapeInsideNegativeCharacterClass);
                            return false;
                        }

                        if (!UnicodeCategories.Contains(ccToken.Groups["UCategory"].Value))
                        {
                            CCRegExError(TexlStrings.ErrInvalidRegExBadUnicodeCategory);
                            return false;
                        }
                    }
                    else if (ccToken.Groups["badEscape"].Success)
                    {
                        CCRegExError(TexlStrings.ErrInvalidRegExBadEscape);
                        return false;
                    }
                    else if (ccToken.Groups["goodEscapeOutsideCC"].Success || ccToken.Groups["backRefName"].Success || ccToken.Groups["backRefNumber"].Success)
                    {
                        CCRegExError(TexlStrings.ErrInvalidRegExBadEscapeInsideCharacterClass);
                        return false;
                    }
                    else if (ccToken.Groups["badOctal"].Success)
                    {
                        CCRegExError(TexlStrings.ErrInvalidRegExBadOctal);
                        return false;
                    }
                    else if (ccToken.Groups["badInCharClass"].Success)
                    {
                        CCRegExError(TexlStrings.ErrInvalidRegExUnescapedCharInCharacterClass);
                        return false;
                    }
                    else if (ccToken.Groups["badDoubleInCharClass"].Success)
                    {
                        CCRegExError(TexlStrings.ErrInvalidRegExRepeatInCharClass);
                        return false;
                    }
                    else if (ccToken.Groups["badHyphen"].Success)
                    {
                        // intentionally RegExError to get the whole character class as this is on the ends
                        RegExError(TexlStrings.ErrInvalidRegExLiteralHyphenInCharacterClass);
                        return false;
                    }
                    else
                    {
                        // This should never be hit. It is here in case one of the names checked doesn't match the RE, in which case running tests would hit this.
                        throw new NotImplementedException("Unknown character class regular expression match: CC = " + token.Value + ", ccToken = " + ccToken.Value);
                    }
                }
            }
            else if (token.Groups["goodNamedCapture"].Success)
            {
                var namedCapture = token.Groups["goodNamedCapture"].Value;

                if (numberedCapture)
                {
                    RegExError(TexlStrings.ErrInvalidRegExMixingNamedAndNumberedSubMatches);
                    return false;
                }

                if (captureNames.Contains(namedCapture))
                {
                    RegExError(TexlStrings.ErrInvalidRegExBadNamedCaptureAlreadyExists);
                    return false;
                }

                captureStack.Push(namedCapture);
                captureNames.Add(namedCapture);
            }
            else if (token.Groups["goodNonCapture"].Success || token.Groups["goodLookaround"].Success)
            {
                captureStack.Push(null);
            }
            else if (token.Groups["openParen"].Success)
            {
                if (numberedCapture)
                {
                    captureNumber++;
                    captureStack.Push(captureNumber.ToString(CultureInfo.InvariantCulture));
                }
                else
                {
                    captureStack.Push(null);
                }
            }
            else if (token.Groups["closeParen"].Success)
            {
                if (captureStack.Count == 0)
                {
                    RegExError(TexlStrings.ErrInvalidRegExUnopenedCaptureGroups, context: true);
                    return false;
                }
                else
                {
                    captureStack.Pop();
                }
            }
            else if (token.Groups["backRefName"].Success)
            {
                var backRefName = token.Groups["backRefName"].Value;

                if (numberedCapture)
                {
                    RegExError(TexlStrings.ErrInvalidRegExMixingNamedAndNumberedSubMatches);
                    return false;
                }

                // group isn't defined, or not defined yet
                if (!captureNames.Contains(backRefName))
                {
                    RegExError(TexlStrings.ErrInvalidRegExBadBackRefNotDefined);
                    return false;
                }

                // group is not closed and thus self referencing
                if (captureStack.Contains(backRefName))
                {
                    RegExError(TexlStrings.ErrInvalidRegExBadBackRefSelfReferencing);
                    return false;
                }
            }
            else if (token.Groups["backRefNumber"].Success)
            {
                var backRef = token.Groups["backRefNumber"].Value;

                if (!numberedCapture)
                {
                    RegExError(TexlStrings.ErrInvalidRegExNumberedSubMatchesDisabled);
                    return false;
                }

                // \d+ is Unicode aware and unbounded, so the digits may not fit in (or even parse as) an int.
                // Such a reference cannot name an existing group; report it instead of throwing.
                // Otherwise, the back ref number must already have been defined.
                if (!int.TryParse(backRef, NumberStyles.None, CultureInfo.InvariantCulture, out var backRefNumber) ||
                    backRefNumber < 1 || backRefNumber > captureNumber)
                {
                    RegExError(TexlStrings.ErrInvalidRegExBadBackRefNotDefined);
                    return false;
                }

                // group is not closed and thus self referencing
                if (captureStack.Contains(backRef))
                {
                    RegExError(TexlStrings.ErrInvalidRegExBadBackRefSelfReferencing);
                    return false;
                }
            }
            else if (token.Groups["goodUEscape"].Success)
            {
                if (!UnicodeCategories.Contains(token.Groups["UCategory"].Value))
                {
                    RegExError(TexlStrings.ErrInvalidRegExBadUnicodeCategory);
                    return false;
                }
            }
            else if (token.Groups["goodInlineOptions"].Success)
            {
                var inlineOptions = token.Groups["goodInlineOptions"].Value;

                if (Regex.IsMatch(inlineOptions, @"(?<char>.).*\k<char>"))
                {
                    RegExError(TexlStrings.ErrInvalidRegExRepeatedInlineOption);
                    return false;
                }

                if (inlineOptions.Contains("n") && numberedCapture)
                {
                    RegExError(TexlStrings.ErrInvalidRegExInlineOptionConflictsWithNumberedSubMatches);
                    return false;
                }

                if (inlineOptions.Contains("x"))
                {
                    freeSpacing = true;
                }
            }
            else if (token.Groups["goodInlineComment"].Success)
            {
                openInlineComment = true;
            }
            else if (token.Groups["poundComment"].Success)
            {
                openPoundComment = freeSpacing;
            }
            else if (token.Groups["badNamedCaptureName"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadNamedCaptureName);
                return false;
            }
            else if (token.Groups["badOctal"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadOctal);
                return false;
            }
            else if (token.Groups["badBalancing"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadBalancing);
                return false;
            }
            else if (token.Groups["badInlineOptions"].Success)
            {
                RegExError(token.Groups["badInlineOptions"].Index > 0 ? TexlStrings.ErrInvalidRegExInlineOptionNotAtStart : TexlStrings.ErrInvalidRegExBadInlineOptions);
                return false;
            }
            else if (token.Groups["badSingleQuoteNamedCapture"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadSingleQuoteNamedCapture);
                return false;
            }
            else if (token.Groups["badConditional"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadConditional);
                return false;
            }
            else if (token.Groups["badEscape"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadEscape);
                return false;
            }
            else if (token.Groups["goodEscapeInsideCCOnly"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadEscapeOutsideCharacterClass);
                return false;
            }
            else if (token.Groups["badQuantifiers"].Success || token.Groups["badLimited"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadQuantifier);
                return false;
            }
            else if (token.Groups["badExact"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadExactQuantifier);
                return false;
            }
            else if (token.Groups["badCurly"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadCurly);
                return false;
            }
            else if (token.Groups["badParen"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadParen, context: true);
                return false;
            }
            else if (token.Groups["badSquareBrackets"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExBadSquare, context: true);
                return false;
            }
            else if (token.Groups["badEmptyCharacterClass"].Success)
            {
                RegExError(TexlStrings.ErrInvalidRegExEmptyCharacterClass);
                return false;
            }
            else
            {
                // This should never be hit. It is here in case one of the Groups names checked doesn't match the RE, in which case running tests would hit this.
                throw new NotImplementedException("Unknown general regular expression match: " + token.Value);
            }
        }
    }

    if (openInlineComment)
    {
        errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExUnclosedInlineComment);
        return false;
    }

    if (captureStack.Count > 0)
    {
        errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegExUnclosedCaptureGroups);
        return false;
    }

    // may be modified by inline options; we only care about x and N in the next stage
    alteredOptions = (freeSpacing ? "x" : string.Empty) + (numberedCapture ? "N" : string.Empty);
    return true;
}

// The scanner regular expressions below look for interesting constructs, ignoring other elements and constructs that are legal, such as letters and numbers.
// Order of alternation is important. .NET regular expressions are greedy and will match the first of these that it can.
// Many subexpressions here take advantage of this, matching something that is valid, before falling through to check for something that is invalid.
//
// For example, consider testing "\\(\a)". This will match <goodEscape> <openParen> <badEscape> <closeParen>.
// <badEscapeAlpha> will report an error and stop further processing.
// One might think that the "\a" could have matched <goodEscape>, but it will match <badEscapeAlpha> first because it is first in the RE.
// One might think that the "\(" could have matched <goodEscape>, but the double backslashes will be consumed first, which is why it is important
// to gather all the matches in a linear scan from the beginning to the end.
//
// Three regular expressions are utilized:
// - EscapeRE is a regular expression fragment that is shared by the other two, included at the beginning each of the others
// - GeneralRE is used outside of a character class
// - CharacterClassRE is used inside a character class
// GeneralRE and CharacterClassRE are built once and shared, rather than being rebuilt on every type check.
private const string EscapeRE =
@"
# leading backslash, escape sequences
\\k<(?<backRefName>\w+)> | # named backreference
(?<badOctal>\\0\d*) | # \0 and octal are not accepted, ambiguous and not needed (use \x instead)
\\(?<backRefNumber>\d+) | # numeric backreference, must be enabled with MatchOptions.NumberedSubMatches
(?<goodEscape>\\
([dfnrstw] | # standard regex character classes, missing from .NET are aAeGzZv (no XRegExp support), other common are u{} and o
[\^\$\\\.\*\+\?\(\)\[\]\{\}\|\/\#\ ] | # acceptable escaped characters with Unicode aware ECMAScript with # and space for Free Spacing
c[a-zA-Z] | # Ctrl character classes
x[0-9a-fA-F]{2} | # hex character, must be exactly 2 hex digits
u[0-9a-fA-F]{4})) | # Unicode characters, must be exactly 4 hex digits
\\(?<goodUEscape>[pP])\{(?<UCategory>[\w=:-]+)\} | # Unicode chaeracter classes, extra characters here for a better error message
(?<goodEscapeOutsideCC>\\[bB]) | # acceptable outside a character class, includes negative classes until we have character class subtraction, include \P for future MatchOptions.LocaleAware
(?<goodEscapeOutsideAndInsideCCIfPositive>\\[DWS]) |
(?<goodEscapeInsideCCOnly>\\[&\-!#%,;:<=>@`~\^]) | # https://262.ecma-international.org/#prod-ClassSetReservedPunctuator, others covered with goodEscape above
(?<badEscape>\\.) | # all other escaped characters are invalid and reserved for future use
";

// Tokenizer for everything outside of a character class.
// Note: in badEmptyCharacterClass the caret is escaped (an unescaped ^ would be an anchor that can never match there),
// and in badLimited the trailing class holds only + and * (a | inside a class is a literal and would reject "a{1,3}|b").
private static readonly Regex GeneralRE = new Regex(
    EscapeRE +
@"
# leading (?<, named captures
\(\?<(?<goodNamedCapture>[a-zA-Z][a-zA-Z\d]*)> | # named capture group, can only be letters and numbers and must start with a letter
(?<goodLookaround>\(\?(=|!|<=|<!)) | # lookahead and lookbehind
(?<badBalancing>\(\?<\w*-\w*>) | # .NET balancing captures are not supported
(?<badNamedCaptureName>\(\?<[^>]*>) | # bad named capture name, didn't match goodNamedCapture
(?<badSingleQuoteNamedCapture>\(\?'[^']*') | # single quoted capture names are not supported
# leading (?, misc
(?<goodNonCapture>\(\?:) | # non-capture group, still need to track to match with closing paren
\A\(\?(?<goodInlineOptions>[imnsx]+)\) | # inline options
(?<goodInlineComment>\(\?\#) | # inline comment
(?<badInlineOptions>\(\?(\w+|\w*-\w+)[\:\)]) | # inline options, including disable of options
(?<badConditional>\(\?\() | # .NET conditional alternations are not supported
# leading (, used for other special purposes
(?<badParen>\([\?\+\*].?) | # everything else unsupported that could start with a (, includes atomic groups, recursion, subroutines, branch reset, and future features
# leading ?\*\+, quantifiers
(?<badQuantifiers>[\?\*\+][\+\*]) | # possessive (ends with +) and useless quantifiers (ends with *)
(?<goodQuantifiers>[\?\*\+]\??) | # greedy and lazy quantifiers
# leading {, limited quantifiers
(?<badExact>{\d+}[\+\*\?]) | # exact quantifier can't be used with a modifier
(?<goodExact>{\d+}) | # standard exact quantifier, no optional lazy
(?<badLimited>{\d+,\d*}[\+\*]) | # possessive and useless quantifiers
(?<goodLimited>{\d+,\d*}\??) | # standard limited quantifiers, with optional lazy
(?<badCurly>[{}]) | # more constrained, blocks {,3} and Java/Rust semantics that does not treat this as a literal
# character class
(?<badEmptyCharacterClass>\[\]|\[\^\]) | # some implementations support empty character class, with varying semantics; we do not
\[(?<characterClass>(\\\]|\\\[|[^\]\[])+)\] | # does not accept empty character class
(?<badSquareBrackets>[\[\]]) | # square brackets that are not escaped and didn't define a character class
# open and close regions
(?<openParen>\() |
(?<closeParen>\)) |
(?<poundComment>\#) | # used in free spacing mode (to detect start of comment), ignored otherwise
(?<newline>[\r\n]) # used in free spacing mode (to detect end of comment), ignored otherwise
", RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);

// Tokenizer for the contents of a character class (without the surrounding brackets and leading ^).
private static readonly Regex CharacterClassRE = new Regex(
    EscapeRE +
@"
(?<badHyphen>^-|-$) | # begin/end literal hyphen not allowed within character class, needs to be escaped (ECMAScript v)
(?<badInCharClass> \/ | \| | \\ | # https://262.ecma-international.org/#prod-ClassSetSyntaxCharacter
\{ | \} | \( | \) | \[ | \] | \^) | # adding ^ for Power Fx, making it clear that the carets in [^^] have different meanings
(?<badDoubleInCharClass> << | == | >> | :: | # reserved pairs, see https://262.ecma-international.org/#prod-ClassSetReservedDoublePunctuator
@@ | `` | ~~ | %% | && | ;; | ,, | !! | # and https://www.unicode.org/reports/tr18/#Subtraction_and_Intersection
\|\| | \#\# | \$\$ | \*\* | \+\+ | \.\. | # includes set subtraction
\?\? | \^\^ | \-\-)
", RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
// Derives the record/table return type from the regular expression by compiling it with .NET.
// The resulting type has one string column per named capture group plus the predefined columns
// ColumnName_FullMatch (string), ColumnName_SubMatches (table of string Value, only when the "N" option is set)
// and ColumnName_StartMatch (number). A named capture with the same name as a predefined column replaces
// ("hides") that column, and a suggestion is reported for it.
//   regExNode      - node that errors and suggestions are reported against
//   regexPattern   - the constant regular expression text
//   alteredOptions - option characters; "x" (free spacing) and "N" (numbered sub-matches) are honored
//   errors         - receives ErrInvalidRegEx when the pattern does not compile, and the hiding suggestions
//   returnType     - in: empty record or table type selecting the shape; out: the derived type on success
// Returns true when the type was created (or found in the cache), false when the pattern is invalid.
private bool TryCreateReturnType(TexlNode regExNode, string regexPattern, string alteredOptions, IErrorContainer errors, ref DType returnType)
{
    Contracts.AssertValue(regexPattern);

    // The derived type depends on the options as well as on the pattern ("N" adds the SubMatches column,
    // "x" changes which group names the pattern defines), so both are part of the cache key.
    string cacheKey = _cachePrefix + alteredOptions + "_" + regexPattern;

    if (_regexTypeCache != null)
    {
        // Single lookup: the cache is shared and may be cleared concurrently, so a ContainsKey check
        // followed by the indexer could throw.
        if (_regexTypeCache.TryGetValue(cacheKey, out var cachedType))
        {
            if (cachedType == null)
            {
                // A null entry records a pattern that failed to compile earlier.
                errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegEx);
                return false;
            }

            returnType = cachedType.Item1;
            AddWarnings(regExNode, errors, cachedType.Item2, cachedType.Item3, cachedType.Item4);
            return true;
        }

        if (_regexTypeCache.Count >= _regexCacheSize)
        {
            // To preserve memory during authoring, we clear the cache if it gets
            // too large. This should only happen in a minority of cases and
            // should have no impact on deployed apps.
            _regexTypeCache.Clear();
        }
    }

    try
    {
        var regexDotNetOptions = RegexOptions.None;
        if (alteredOptions.Contains("x"))
        {
            regexDotNetOptions |= RegexOptions.IgnorePatternWhitespace;

            // In x mode, comment line endings are [\r\n], but .NET only supports \n. For our purposes here, we can just replace the \r.
            regexPattern = regexPattern.Replace('\r', '\n');
        }

        var regex = new Regex(regexPattern, regexDotNetOptions);

        List<TypedName> propertyNames = new List<TypedName>();
        bool fullMatchHidden = false, subMatchesHidden = false, startMatchHidden = false;

        foreach (var captureName in regex.GetGroupNames())
        {
            if (int.TryParse(captureName, out _))
            {
                // Unnamed captures are returned as integers, ignoring them
                continue;
            }

            if (captureName == ColumnName_FullMatch.Value)
            {
                fullMatchHidden = true;
            }
            else if (captureName == ColumnName_SubMatches.Value)
            {
                subMatchesHidden = true;
            }
            else if (captureName == ColumnName_StartMatch.Value)
            {
                startMatchHidden = true;
            }

            propertyNames.Add(new TypedName(DType.String, DName.MakeValid(captureName, out _)));
        }

        if (!fullMatchHidden)
        {
            propertyNames.Add(new TypedName(DType.String, ColumnName_FullMatch));
        }

        if (!subMatchesHidden && alteredOptions.Contains("N"))
        {
            propertyNames.Add(new TypedName(DType.CreateTable(new TypedName(DType.String, ColumnName_Value)), ColumnName_SubMatches));
        }

        if (!startMatchHidden)
        {
            propertyNames.Add(new TypedName(DType.Number, ColumnName_StartMatch));
        }

        returnType = returnType.IsRecord
            ? DType.CreateRecord(propertyNames)
            : DType.CreateTable(propertyNames);

        AddWarnings(regExNode, errors, hidesFullMatch: fullMatchHidden, hidesSubMatches: subMatchesHidden, hidesStartMatch: startMatchHidden);

        if (_regexTypeCache != null)
        {
            _regexTypeCache[cacheKey] = Tuple.Create(returnType, fullMatchHidden, subMatchesHidden, startMatchHidden);
        }

        return true;
    }
    catch (ArgumentException)
    {
        // Thrown by the Regex constructor when the pattern is not valid for .NET.
        errors.EnsureError(regExNode, TexlStrings.ErrInvalidRegEx);
        if (_regexTypeCache != null)
        {
            _regexTypeCache[cacheKey] = null; // Cache to avoid evaluating again
        }

        return false;
    }
}
// Reports a suggestion-severity message on regExNode for each predefined column
// (FullMatch, SubMatches, StartMatch, in that order) that a named capture group hides.
private void AddWarnings(TexlNode regExNode, IErrorContainer errors, bool hidesFullMatch, bool hidesSubMatches, bool hidesStartMatch)
{
    void SuggestWhenHidden(bool isHidden, ErrorResourceKey message, DName column)
    {
        if (!isHidden)
        {
            return;
        }

        errors.EnsureError(DocumentErrorSeverity.Suggestion, regExNode, message, column.Value);
    }

    SuggestWhenHidden(hidesFullMatch, TexlStrings.InfoRegExCaptureNameHidesPredefinedFullMatchField, ColumnName_FullMatch);
    SuggestWhenHidden(hidesSubMatches, TexlStrings.InfoRegExCaptureNameHidesPredefinedSubMatchesField, ColumnName_SubMatches);
    SuggestWhenHidden(hidesStartMatch, TexlStrings.InfoRegExCaptureNameHidesPredefinedStartMatchField, ColumnName_StartMatch);
}
}
}
#pragma warning restore SA1402 // File may only contain a single type
#pragma warning restore SA1649 // File name should match first type name