Skip to content

Commit 49182c3

Browse files
Copilot and stephentoub committed
Factor out alternation starting char detection into RegexNode helper
Add TryGetAlternationStartingChars helper method on RegexNode that determines whether every branch of an alternation node begins with unique starting characters. This method is now shared by both RegexCompiler.cs and RegexGenerator.Emitter.cs, reducing code duplication and keeping both implementations synchronized. Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com>
1 parent 03e7c36 commit 49182c3

File tree

3 files changed

+74
-92
lines changed

3 files changed

+74
-92
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 2 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1848,53 +1848,9 @@ bool TryEmitAlternationAsSwitch()
18481848
}
18491849

18501850
// Detect whether every branch begins with one or more unique characters.
1851-
const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
1852-
Span<char> setChars = stackalloc char[SetCharsSize];
1853-
1854-
// Iterate through every branch, seeing if we can easily find a starting One, Multi, or small Set.
1855-
// If we can, extract its starting char (or multiple in the case of a set), validate that all such
1856-
// starting characters are unique relative to all the branches.
1857-
var seenChars = new HashSet<char>();
1858-
for (int i = 0; i < childCount; i++)
1851+
if (!node.TryGetAlternationStartingChars(out _))
18591852
{
1860-
// Look for the guaranteed starting node that's a one, multi, set,
1861-
// or loop of one of those with at least one minimum iteration. We need to exclude notones.
1862-
if (node.Child(i).FindStartingLiteralNode(allowZeroWidth: false) is not RegexNode startingLiteralNode ||
1863-
startingLiteralNode.IsNotoneFamily)
1864-
{
1865-
return false;
1866-
}
1867-
1868-
// If it's a One or a Multi, get the first character and add it to the set.
1869-
// If it was already in the set, we can't apply this optimization.
1870-
if (startingLiteralNode.IsOneFamily || startingLiteralNode.Kind is RegexNodeKind.Multi)
1871-
{
1872-
if (!seenChars.Add(startingLiteralNode.FirstCharOfOneOrMulti()))
1873-
{
1874-
return false;
1875-
}
1876-
}
1877-
else
1878-
{
1879-
// The branch begins with a set. Make sure it's a set of only a few characters
1880-
// and get them. If we can't, we can't apply this optimization.
1881-
Debug.Assert(startingLiteralNode.IsSetFamily);
1882-
int numChars;
1883-
if (RegexCharClass.IsNegated(startingLiteralNode.Str!) ||
1884-
(numChars = RegexCharClass.GetSetChars(startingLiteralNode.Str!, setChars)) == 0)
1885-
{
1886-
return false;
1887-
}
1888-
1889-
// Check to make sure each of the chars is unique relative to all other branches examined.
1890-
foreach (char c in setChars.Slice(0, numChars))
1891-
{
1892-
if (!seenChars.Add(c))
1893-
{
1894-
return false;
1895-
}
1896-
}
1897-
}
1853+
return false;
18981854
}
18991855

19001856
EmitSwitchedBranches();

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 2 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1896,53 +1896,9 @@ bool TryEmitAlternationAsSwitch(RegexNode node, int childCount, bool isAtomic)
18961896
}
18971897

18981898
// Detect whether every branch begins with one or more unique characters.
1899-
const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
1900-
Span<char> setChars = stackalloc char[SetCharsSize];
1901-
var seenChars = new HashSet<char>();
1902-
1903-
// Iterate through every branch, seeing if we can easily find a starting One, Multi, or small Set.
1904-
// If we can, extract its starting char (or multiple in the case of a set), validate that all such
1905-
// starting characters are unique relative to all the branches.
1906-
for (int i = 0; i < childCount; i++)
1899+
if (!node.TryGetAlternationStartingChars(out HashSet<char>? seenChars))
19071900
{
1908-
// Look for the guaranteed starting node that's a one, multi, set,
1909-
// or loop of one of those with at least one minimum iteration. We need to exclude notones.
1910-
if (node.Child(i).FindStartingLiteralNode(allowZeroWidth: false) is not RegexNode startingLiteralNode ||
1911-
startingLiteralNode.IsNotoneFamily)
1912-
{
1913-
return false;
1914-
}
1915-
1916-
// If it's a One or a Multi, get the first character and add it to the set.
1917-
// If it was already in the set, we can't apply this optimization.
1918-
if (startingLiteralNode.IsOneFamily || startingLiteralNode.Kind is RegexNodeKind.Multi)
1919-
{
1920-
if (!seenChars.Add(startingLiteralNode.FirstCharOfOneOrMulti()))
1921-
{
1922-
return false;
1923-
}
1924-
}
1925-
else
1926-
{
1927-
// The branch begins with a set. Make sure it's a set of only a few characters
1928-
// and get them. If we can't, we can't apply this optimization.
1929-
Debug.Assert(startingLiteralNode.IsSetFamily);
1930-
int numChars;
1931-
if (RegexCharClass.IsNegated(startingLiteralNode.Str!) ||
1932-
(numChars = RegexCharClass.GetSetChars(startingLiteralNode.Str!, setChars)) == 0)
1933-
{
1934-
return false;
1935-
}
1936-
1937-
// Check to make sure each of the chars is unique relative to all other branches examined.
1938-
foreach (char c in setChars.Slice(0, numChars))
1939-
{
1940-
if (!seenChars.Add(c))
1941-
{
1942-
return false;
1943-
}
1944-
}
1945-
}
1901+
return false;
19461902
}
19471903

19481904
// Compute min/max to determine density for choosing IL switch vs comparisons

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1500,6 +1500,76 @@ public char FirstCharOfOneOrMulti()
15001500
return IsOneFamily ? Ch : Str![0];
15011501
}
15021502

1503+
/// <summary>
1504+
/// Determines whether every branch of this alternation node begins with one or more starting characters that are unique across all of the branches.
1505+
/// </summary>
1506+
/// <param name="seenChars">When this method returns true, the set of starting characters collected from all branches; when it returns false, null.</param>
1507+
/// <returns>true if every branch starts with a One, a Multi, or a non-negated Set whose characters could be extracted, and no starting character is shared between branches; otherwise, false.</returns>
1508+
/// <remarks>
1509+
/// This method is used to determine if an alternation can be optimized using a switch on the first character. It must be called on an Alternate node that does not use RegexOptions.RightToLeft; both conditions are checked with Debug.Assert.
1510+
/// </remarks>
1511+
public bool TryGetAlternationStartingChars([NotNullWhen(true)] out HashSet<char>? seenChars)
1512+
{
1513+
Debug.Assert(Kind is RegexNodeKind.Alternate);
1514+
Debug.Assert((Options & RegexOptions.RightToLeft) == 0);
1515+
1516+
const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
1517+
Span<char> setChars = stackalloc char[SetCharsSize];
1518+
seenChars = new HashSet<char>();
1519+
1520+
// Iterate through every branch, seeing if we can easily find a starting One, Multi, or small Set.
1521+
// If we can, extract its starting char (or multiple in the case of a set), validate that all such
1522+
// starting characters are unique relative to all the branches.
1523+
int childCount = ChildCount();
1524+
for (int i = 0; i < childCount; i++)
1525+
{
1526+
// Look for the guaranteed starting node that's a one, multi, set,
1527+
// or loop of one of those with at least one minimum iteration. We need to exclude notones.
1528+
if (Child(i).FindStartingLiteralNode(allowZeroWidth: false) is not RegexNode startingLiteralNode ||
1529+
startingLiteralNode.IsNotoneFamily)
1530+
{
1531+
seenChars = null;
1532+
return false;
1533+
}
1534+
1535+
// If it's a One or a Multi, get the first character and add it to the set.
1536+
// If it was already in the set, we can't apply this optimization.
1537+
if (startingLiteralNode.IsOneFamily || startingLiteralNode.Kind is RegexNodeKind.Multi)
1538+
{
1539+
if (!seenChars.Add(startingLiteralNode.FirstCharOfOneOrMulti()))
1540+
{
1541+
seenChars = null;
1542+
return false;
1543+
}
1544+
}
1545+
else
1546+
{
1547+
// The branch begins with a set. Make sure it's a set of only a few characters
1548+
// and get them. If we can't, we can't apply this optimization.
1549+
Debug.Assert(startingLiteralNode.IsSetFamily);
1550+
int numChars;
1551+
if (RegexCharClass.IsNegated(startingLiteralNode.Str!) ||
1552+
(numChars = RegexCharClass.GetSetChars(startingLiteralNode.Str!, setChars)) == 0)
1553+
{
1554+
seenChars = null;
1555+
return false;
1556+
}
1557+
1558+
// Check to make sure each of the chars is unique relative to all other branches examined.
1559+
foreach (char c in setChars.Slice(0, numChars))
1560+
{
1561+
if (!seenChars.Add(c))
1562+
{
1563+
seenChars = null;
1564+
return false;
1565+
}
1566+
}
1567+
}
1568+
}
1569+
1570+
return true;
1571+
}
1572+
15031573
/// <summary>Finds the guaranteed beginning literal(s) of the node, or null if none exists.</summary>
15041574
public RegexNode? FindStartingLiteralNode(bool allowZeroWidth = true)
15051575
{

0 commit comments

Comments
 (0)