Limit recursion depth in markdown parser to prevent stack overflow (#1273)
Some checks failed
docker / pack (push) Has been cancelled
docker / deploy (push) Has been cancelled
main / format (push) Has been cancelled
main / test (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-arm) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-musl-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, osx-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, osx-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-x86) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, linux-arm) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, linux-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, linux-musl-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, linux-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, osx-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, osx-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, win-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, win-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, win-x86) (push) Has been cancelled
main / release (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-arm) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-musl-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, osx-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, osx-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-x86) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, linux-arm) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, linux-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, linux-musl-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, linux-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, osx-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, osx-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, win-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, win-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, win-x86) (push) Has been cancelled
main / notify (push) Has been cancelled

This commit is contained in:
Oleksii Holub 2024-08-14 23:52:03 +03:00 committed by GitHub
parent 10adba3a4d
commit 522789e01d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 288 additions and 232 deletions

View file

@ -2,20 +2,22 @@
namespace DiscordChatExporter.Core.Markdown.Parsing; namespace DiscordChatExporter.Core.Markdown.Parsing;
internal class AggregateMatcher<T>(IReadOnlyList<IMatcher<T>> matchers) : IMatcher<T> internal class AggregateMatcher<TContext, TValue>(
IReadOnlyList<IMatcher<TContext, TValue>> matchers
) : IMatcher<TContext, TValue>
{ {
public AggregateMatcher(params IMatcher<T>[] matchers) public AggregateMatcher(params IMatcher<TContext, TValue>[] matchers)
: this((IReadOnlyList<IMatcher<T>>)matchers) { } : this((IReadOnlyList<IMatcher<TContext, TValue>>)matchers) { }
public ParsedMatch<T>? TryMatch(StringSegment segment) public ParsedMatch<TValue>? TryMatch(TContext context, StringSegment segment)
{ {
ParsedMatch<T>? earliestMatch = null; ParsedMatch<TValue>? earliestMatch = null;
// Try to match the input with each matcher and get the match with the lowest start index // Try to match the input with each matcher and get the match with the lowest start index
foreach (var matcher in matchers) foreach (var matcher in matchers)
{ {
// Try to match // Try to match
var match = matcher.TryMatch(segment); var match = matcher.TryMatch(context, segment);
// If there's no match - continue // If there's no match - continue
if (match is null) if (match is null)

View file

@ -3,17 +3,18 @@ using System.Collections.Generic;
namespace DiscordChatExporter.Core.Markdown.Parsing; namespace DiscordChatExporter.Core.Markdown.Parsing;
internal interface IMatcher<T> internal interface IMatcher<in TContext, TValue>
{ {
ParsedMatch<T>? TryMatch(StringSegment segment); ParsedMatch<TValue>? TryMatch(TContext context, StringSegment segment);
} }
internal static class MatcherExtensions internal static class MatcherExtensions
{ {
public static IEnumerable<ParsedMatch<T>> MatchAll<T>( public static IEnumerable<ParsedMatch<TValue>> MatchAll<TContext, TValue>(
this IMatcher<T> matcher, this IMatcher<TContext, TValue> matcher,
TContext context,
StringSegment segment, StringSegment segment,
Func<StringSegment, T> transformFallback Func<TContext, StringSegment, TValue> transformFallback
) )
{ {
// Loop through segments divided by individual matches // Loop through segments divided by individual matches
@ -22,6 +23,7 @@ internal static class MatcherExtensions
{ {
// Find a match within this segment // Find a match within this segment
var match = matcher.TryMatch( var match = matcher.TryMatch(
context,
segment.Relocate(currentIndex, segment.EndIndex - currentIndex) segment.Relocate(currentIndex, segment.EndIndex - currentIndex)
); );
@ -36,9 +38,9 @@ internal static class MatcherExtensions
match.Segment.StartIndex - currentIndex match.Segment.StartIndex - currentIndex
); );
yield return new ParsedMatch<T>( yield return new ParsedMatch<TValue>(
fallbackSegment, fallbackSegment,
transformFallback(fallbackSegment) transformFallback(context, fallbackSegment)
); );
} }
@ -53,7 +55,10 @@ internal static class MatcherExtensions
{ {
var fallbackSegment = segment.Relocate(currentIndex, segment.EndIndex - currentIndex); var fallbackSegment = segment.Relocate(currentIndex, segment.EndIndex - currentIndex);
yield return new ParsedMatch<T>(fallbackSegment, transformFallback(fallbackSegment)); yield return new ParsedMatch<TValue>(
fallbackSegment,
transformFallback(context, fallbackSegment)
);
} }
} }
} }

View file

@ -0,0 +1,3 @@
namespace DiscordChatExporter.Core.Markdown.Parsing;
internal readonly record struct MarkdownContext(int Depth = 0);

View file

@ -23,15 +23,15 @@ internal static partial class MarkdownParser
/* Formatting */ /* Formatting */
private static readonly IMatcher<MarkdownNode> BoldFormattingNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> BoldFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// There must be exactly two closing asterisks. // There must be exactly two closing asterisks.
new Regex(@"\*\*(.+?)\*\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline), new Regex(@"\*\*(.+?)\*\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => new FormattingNode(FormattingKind.Bold, Parse(s.Relocate(m.Groups[1]))) (c, s, m) => new FormattingNode(FormattingKind.Bold, Parse(c, s.Relocate(m.Groups[1])))
); );
private static readonly IMatcher<MarkdownNode> ItalicFormattingNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> ItalicFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// There must be exactly one closing asterisk. // There must be exactly one closing asterisk.
// Opening asterisk must not be followed by whitespace. // Opening asterisk must not be followed by whitespace.
// Closing asterisk must not be preceded by whitespace. // Closing asterisk must not be preceded by whitespace.
@ -39,156 +39,174 @@ internal static partial class MarkdownParser
@"\*(?!\s)(.+?)(?<!\s|\*)\*(?!\*)", @"\*(?!\s)(.+?)(?<!\s|\*)\*(?!\*)",
DefaultRegexOptions | RegexOptions.Singleline DefaultRegexOptions | RegexOptions.Singleline
), ),
(s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1]))) (c, s, m) =>
new FormattingNode(FormattingKind.Italic, Parse(c, s.Relocate(m.Groups[1])))
); );
private static readonly IMatcher<MarkdownNode> ItalicBoldFormattingNodeMatcher = private static readonly IMatcher<
new RegexMatcher<MarkdownNode>( MarkdownContext,
MarkdownNode
> ItalicBoldFormattingNodeMatcher = new RegexMatcher<MarkdownContext, MarkdownNode>(
// There must be exactly three closing asterisks. // There must be exactly three closing asterisks.
new Regex(@"\*(\*\*.+?\*\*)\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline), new Regex(@"\*(\*\*.+?\*\*)\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => (c, s, m) =>
new FormattingNode( new FormattingNode(
FormattingKind.Italic, FormattingKind.Italic,
Parse(s.Relocate(m.Groups[1]), BoldFormattingNodeMatcher) Parse(c, s.Relocate(m.Groups[1]), BoldFormattingNodeMatcher)
) )
); );
private static readonly IMatcher<MarkdownNode> ItalicAltFormattingNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> ItalicAltFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Closing underscore must not be followed by a word character. // Closing underscore must not be followed by a word character.
new Regex(@"_(.+?)_(?!\w)", DefaultRegexOptions | RegexOptions.Singleline), new Regex(@"_(.+?)_(?!\w)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1]))) (c, s, m) =>
new FormattingNode(FormattingKind.Italic, Parse(c, s.Relocate(m.Groups[1])))
); );
private static readonly IMatcher<MarkdownNode> UnderlineFormattingNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> UnderlineFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// There must be exactly two closing underscores. // There must be exactly two closing underscores.
new Regex(@"__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline), new Regex(@"__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => new FormattingNode(FormattingKind.Underline, Parse(s.Relocate(m.Groups[1]))) (c, s, m) =>
new FormattingNode(FormattingKind.Underline, Parse(c, s.Relocate(m.Groups[1])))
); );
private static readonly IMatcher<MarkdownNode> ItalicUnderlineFormattingNodeMatcher = private static readonly IMatcher<
new RegexMatcher<MarkdownNode>( MarkdownContext,
MarkdownNode
> ItalicUnderlineFormattingNodeMatcher = new RegexMatcher<MarkdownContext, MarkdownNode>(
// There must be exactly three closing underscores. // There must be exactly three closing underscores.
new Regex(@"_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline), new Regex(@"_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => (c, s, m) =>
new FormattingNode( new FormattingNode(
FormattingKind.Italic, FormattingKind.Italic,
Parse(s.Relocate(m.Groups[1]), UnderlineFormattingNodeMatcher) Parse(c, s.Relocate(m.Groups[1]), UnderlineFormattingNodeMatcher)
) )
); );
private static readonly IMatcher<MarkdownNode> StrikethroughFormattingNodeMatcher = private static readonly IMatcher<
new RegexMatcher<MarkdownNode>( MarkdownContext,
MarkdownNode
> StrikethroughFormattingNodeMatcher = new RegexMatcher<MarkdownContext, MarkdownNode>(
new Regex(@"~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline), new Regex(@"~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => (c, s, m) =>
new FormattingNode(FormattingKind.Strikethrough, Parse(s.Relocate(m.Groups[1]))) new FormattingNode(FormattingKind.Strikethrough, Parse(c, s.Relocate(m.Groups[1])))
); );
private static readonly IMatcher<MarkdownNode> SpoilerFormattingNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> SpoilerFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
new Regex(@"\|\|(.+?)\|\|", DefaultRegexOptions | RegexOptions.Singleline), new Regex(@"\|\|(.+?)\|\|", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => new FormattingNode(FormattingKind.Spoiler, Parse(s.Relocate(m.Groups[1]))) (c, s, m) =>
new FormattingNode(FormattingKind.Spoiler, Parse(c, s.Relocate(m.Groups[1])))
); );
private static readonly IMatcher<MarkdownNode> SingleLineQuoteNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> SingleLineQuoteNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Include the linebreak in the content so that the lines are preserved in quotes. // Include the linebreak in the content so that the lines are preserved in quotes.
new Regex(@"^>\s(.+\n?)", DefaultRegexOptions), new Regex(@"^>\s(.+\n?)", DefaultRegexOptions),
(s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1]))) (c, s, m) => new FormattingNode(FormattingKind.Quote, Parse(c, s.Relocate(m.Groups[1])))
); );
private static readonly IMatcher<MarkdownNode> RepeatedSingleLineQuoteNodeMatcher = private static readonly IMatcher<
new RegexMatcher<MarkdownNode>( MarkdownContext,
MarkdownNode
> RepeatedSingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownContext, MarkdownNode>(
// Include the linebreaks in the content, so that the lines are preserved in quotes. // Include the linebreaks in the content, so that the lines are preserved in quotes.
// Empty content is allowed within quotes. // Empty content is allowed within quotes.
// https://github.com/Tyrrrz/DiscordChatExporter/issues/1115 // https://github.com/Tyrrrz/DiscordChatExporter/issues/1115
new Regex(@"(?:^>\s(.*\n?)){2,}", DefaultRegexOptions), new Regex(@"(?:^>\s(.*\n?)){2,}", DefaultRegexOptions),
(s, m) => (c, s, m) =>
new FormattingNode( new FormattingNode(
FormattingKind.Quote, FormattingKind.Quote,
m.Groups[1].Captures.SelectMany(c => Parse(s.Relocate(c))).ToArray() m.Groups[1].Captures.SelectMany(r => Parse(c, s.Relocate(r))).ToArray()
) )
); );
private static readonly IMatcher<MarkdownNode> MultiLineQuoteNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> MultiLineQuoteNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
new Regex(@"^>>>\s(.+)", DefaultRegexOptions | RegexOptions.Singleline), new Regex(@"^>>>\s(.+)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1]))) (c, s, m) => new FormattingNode(FormattingKind.Quote, Parse(c, s.Relocate(m.Groups[1])))
); );
private static readonly IMatcher<MarkdownNode> HeadingNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> HeadingNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Consume the linebreak so that it's not attached to following nodes. // Consume the linebreak so that it's not attached to following nodes.
new Regex(@"^(\#{1,3})\s(.+)\n", DefaultRegexOptions), new Regex(@"^(\#{1,3})\s(.+)\n", DefaultRegexOptions),
(s, m) => new HeadingNode(m.Groups[1].Length, Parse(s.Relocate(m.Groups[2]))) (c, s, m) => new HeadingNode(m.Groups[1].Length, Parse(c, s.Relocate(m.Groups[2])))
); );
private static readonly IMatcher<MarkdownNode> ListNodeMatcher = new RegexMatcher<MarkdownNode>( private static readonly IMatcher<MarkdownContext, MarkdownNode> ListNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Can be preceded by whitespace, which specifies the list's nesting level. // Can be preceded by whitespace, which specifies the list's nesting level.
// Following lines that start with (level+1) whitespace are considered part of the list item. // Following lines that start with (level+1) whitespace are considered part of the list item.
// Consume the linebreak so that it's not attached to following nodes. // Consume the linebreak so that it's not attached to following nodes.
new Regex(@"^(\s*)(?:[\-\*]\s(.+(?:\n\s\1.*)*)?\n?)+", DefaultRegexOptions), new Regex(@"^(\s*)(?:[\-\*]\s(.+(?:\n\s\1.*)*)?\n?)+", DefaultRegexOptions),
(s, m) => (c, s, m) =>
new ListNode( new ListNode(
m.Groups[2].Captures.Select(c => new ListItemNode(Parse(s.Relocate(c)))).ToArray() m.Groups[2]
.Captures.Select(x => new ListItemNode(Parse(c, s.Relocate(x))))
.ToArray()
) )
); );
/* Code blocks */ /* Code blocks */
private static readonly IMatcher<MarkdownNode> InlineCodeBlockNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> InlineCodeBlockNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// One or two backticks are allowed, but they must match on both sides. // One or two backticks are allowed, but they must match on both sides.
new Regex(@"(`{1,2})([^`]+)\1", DefaultRegexOptions | RegexOptions.Singleline), new Regex(@"(`{1,2})([^`]+)\1", DefaultRegexOptions | RegexOptions.Singleline),
(_, m) => new InlineCodeBlockNode(m.Groups[2].Value) (_, _, m) => new InlineCodeBlockNode(m.Groups[2].Value)
); );
private static readonly IMatcher<MarkdownNode> MultiLineCodeBlockNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> MultiLineCodeBlockNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Language identifier is one word immediately after opening backticks, followed immediately by a linebreak. // Language identifier is one word immediately after opening backticks, followed immediately by a linebreak.
// Blank lines at the beginning and at the end of content are trimmed. // Blank lines at the beginning and at the end of content are trimmed.
new Regex(@"```(?:(\w*)\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline), new Regex(@"```(?:(\w*)\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline),
(_, m) => (_, _, m) =>
new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n')) new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n'))
); );
/* Mentions */ /* Mentions */
private static readonly IMatcher<MarkdownNode> EveryoneMentionNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> EveryoneMentionNodeMatcher =
new StringMatcher<MarkdownNode>( new StringMatcher<MarkdownContext, MarkdownNode>(
"@everyone", "@everyone",
_ => new MentionNode(null, MentionKind.Everyone) (_, _) => new MentionNode(null, MentionKind.Everyone)
); );
private static readonly IMatcher<MarkdownNode> HereMentionNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> HereMentionNodeMatcher =
new StringMatcher<MarkdownNode>("@here", _ => new MentionNode(null, MentionKind.Here)); new StringMatcher<MarkdownContext, MarkdownNode>(
"@here",
(_, _) => new MentionNode(null, MentionKind.Here)
);
private static readonly IMatcher<MarkdownNode> UserMentionNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> UserMentionNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture <@123456> or <@!123456> // Capture <@123456> or <@!123456>
new Regex(@"<@!?(\d+)>", DefaultRegexOptions), new Regex(@"<@!?(\d+)>", DefaultRegexOptions),
(_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.User) (_, _, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.User)
); );
private static readonly IMatcher<MarkdownNode> ChannelMentionNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> ChannelMentionNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture <#123456> // Capture <#123456>
new Regex(@"<\#!?(\d+)>", DefaultRegexOptions), new Regex(@"<\#!?(\d+)>", DefaultRegexOptions),
(_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Channel) (_, _, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Channel)
); );
private static readonly IMatcher<MarkdownNode> RoleMentionNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> RoleMentionNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture <@&123456> // Capture <@&123456>
new Regex(@"<@&(\d+)>", DefaultRegexOptions), new Regex(@"<@&(\d+)>", DefaultRegexOptions),
(_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Role) (_, _, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Role)
); );
/* Emoji */ /* Emoji */
private static readonly IMatcher<MarkdownNode> StandardEmojiNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> StandardEmojiNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
new Regex( new Regex(
@"(" @"("
+ +
@ -239,21 +257,21 @@ internal static partial class MarkdownParser
+ @")", + @")",
DefaultRegexOptions DefaultRegexOptions
), ),
(_, m) => new EmojiNode(m.Groups[1].Value) (_, _, m) => new EmojiNode(m.Groups[1].Value)
); );
private static readonly IMatcher<MarkdownNode> CodedStandardEmojiNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> CodedStandardEmojiNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture :thinking: // Capture :thinking:
new Regex(@":([\w_]+):", DefaultRegexOptions), new Regex(@":([\w_]+):", DefaultRegexOptions),
(_, m) => EmojiIndex.TryGetName(m.Groups[1].Value)?.Pipe(n => new EmojiNode(n)) (_, _, m) => EmojiIndex.TryGetName(m.Groups[1].Value)?.Pipe(n => new EmojiNode(n))
); );
private static readonly IMatcher<MarkdownNode> CustomEmojiNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> CustomEmojiNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture <:lul:123456> or <a:lul:123456> // Capture <:lul:123456> or <a:lul:123456>
new Regex(@"<(a)?:(.+?):(\d+?)>", DefaultRegexOptions), new Regex(@"<(a)?:(.+?):(\d+?)>", DefaultRegexOptions),
(_, m) => (_, _, m) =>
new EmojiNode( new EmojiNode(
Snowflake.TryParse(m.Groups[3].Value), Snowflake.TryParse(m.Groups[3].Value),
m.Groups[2].Value, m.Groups[2].Value,
@ -263,70 +281,72 @@ internal static partial class MarkdownParser
/* Links */ /* Links */
private static readonly IMatcher<MarkdownNode> AutoLinkNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> AutoLinkNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Any non-whitespace character after http:// or https:// // Any non-whitespace character after http:// or https://
// until the last punctuation character or whitespace. // until the last punctuation character or whitespace.
new Regex(@"(https?://\S*[^\.,:;""'\s])", DefaultRegexOptions), new Regex(@"(https?://\S*[^\.,:;""'\s])", DefaultRegexOptions),
(_, m) => new LinkNode(m.Groups[1].Value) (_, _, m) => new LinkNode(m.Groups[1].Value)
); );
private static readonly IMatcher<MarkdownNode> HiddenLinkNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> HiddenLinkNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Same as auto link but also surrounded by angular brackets // Same as auto link but also surrounded by angular brackets
new Regex(@"<(https?://\S*[^\.,:;""'\s])>", DefaultRegexOptions), new Regex(@"<(https?://\S*[^\.,:;""'\s])>", DefaultRegexOptions),
(_, m) => new LinkNode(m.Groups[1].Value) (_, _, m) => new LinkNode(m.Groups[1].Value)
); );
private static readonly IMatcher<MarkdownNode> MaskedLinkNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> MaskedLinkNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture [title](link) // Capture [title](link)
new Regex(@"\[(.+?)\]\((.+?)\)", DefaultRegexOptions), new Regex(@"\[(.+?)\]\((.+?)\)", DefaultRegexOptions),
(s, m) => new LinkNode(m.Groups[2].Value, Parse(s.Relocate(m.Groups[1]))) (c, s, m) => new LinkNode(m.Groups[2].Value, Parse(c, s.Relocate(m.Groups[1])))
); );
/* Text */ /* Text */
private static readonly IMatcher<MarkdownNode> ShrugTextNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> ShrugTextNodeMatcher =
new StringMatcher<MarkdownNode>( new StringMatcher<MarkdownContext, MarkdownNode>(
// Capture the shrug kaomoji. // Capture the shrug kaomoji.
// This escapes it from matching for formatting. // This escapes it from matching for formatting.
@"¯\_(ツ)_/¯", @"¯\_(ツ)_/¯",
s => new TextNode(s.ToString()) (s, _) => new TextNode(s.ToString())
); );
private static readonly IMatcher<MarkdownNode> IgnoredEmojiTextNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> IgnoredEmojiTextNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture some specific emoji that don't get rendered. // Capture some specific emoji that don't get rendered.
// This escapes them from matching for emoji. // This escapes them from matching for emoji.
new Regex(@"([\u26A7\u2640\u2642\u2695\u267E\u00A9\u00AE\u2122])", DefaultRegexOptions), new Regex(@"([\u26A7\u2640\u2642\u2695\u267E\u00A9\u00AE\u2122])", DefaultRegexOptions),
(_, m) => new TextNode(m.Groups[1].Value) (_, _, m) => new TextNode(m.Groups[1].Value)
); );
private static readonly IMatcher<MarkdownNode> EscapedSymbolTextNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> EscapedSymbolTextNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture any "symbol/other" character or surrogate pair preceded by a backslash. // Capture any "symbol/other" character or surrogate pair preceded by a backslash.
// This escapes them from matching for emoji. // This escapes them from matching for emoji.
// https://github.com/Tyrrrz/DiscordChatExporter/issues/230 // https://github.com/Tyrrrz/DiscordChatExporter/issues/230
new Regex(@"\\(\p{So}|\p{Cs}{2})", DefaultRegexOptions), new Regex(@"\\(\p{So}|\p{Cs}{2})", DefaultRegexOptions),
(_, m) => new TextNode(m.Groups[1].Value) (_, _, m) => new TextNode(m.Groups[1].Value)
); );
private static readonly IMatcher<MarkdownNode> EscapedCharacterTextNodeMatcher = private static readonly IMatcher<
new RegexMatcher<MarkdownNode>( MarkdownContext,
MarkdownNode
> EscapedCharacterTextNodeMatcher = new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture any non-whitespace, non latin alphanumeric character preceded by a backslash. // Capture any non-whitespace, non latin alphanumeric character preceded by a backslash.
// This escapes them from matching for formatting or other tokens. // This escapes them from matching for formatting or other tokens.
new Regex(@"\\([^a-zA-Z0-9\s])", DefaultRegexOptions), new Regex(@"\\([^a-zA-Z0-9\s])", DefaultRegexOptions),
(_, m) => new TextNode(m.Groups[1].Value) (_, _, m) => new TextNode(m.Groups[1].Value)
); );
/* Misc */ /* Misc */
private static readonly IMatcher<MarkdownNode> TimestampNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> TimestampNodeMatcher =
new RegexMatcher<MarkdownNode>( new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture <t:12345678> or <t:12345678:R> // Capture <t:12345678> or <t:12345678:R>
new Regex(@"<t:(-?\d+)(?::(\w))?>", DefaultRegexOptions), new Regex(@"<t:(-?\d+)(?::(\w))?>", DefaultRegexOptions),
(_, m) => (_, _, m) =>
{ {
try try
{ {
@ -382,7 +402,8 @@ internal static partial class MarkdownParser
); );
// Matchers that have similar patterns are ordered from most specific to least specific // Matchers that have similar patterns are ordered from most specific to least specific
private static readonly IMatcher<MarkdownNode> NodeMatcher = new AggregateMatcher<MarkdownNode>( private static readonly IMatcher<MarkdownContext, MarkdownNode> NodeMatcher =
new AggregateMatcher<MarkdownContext, MarkdownNode>(
// Escaped text // Escaped text
ShrugTextNodeMatcher, ShrugTextNodeMatcher,
IgnoredEmojiTextNodeMatcher, IgnoredEmojiTextNodeMatcher,
@ -424,8 +445,8 @@ internal static partial class MarkdownParser
); );
// Minimal set of matchers for non-multimedia formats (e.g. plain text) // Minimal set of matchers for non-multimedia formats (e.g. plain text)
private static readonly IMatcher<MarkdownNode> MinimalNodeMatcher = private static readonly IMatcher<MarkdownContext, MarkdownNode> MinimalNodeMatcher =
new AggregateMatcher<MarkdownNode>( new AggregateMatcher<MarkdownContext, MarkdownNode>(
// Mentions // Mentions
EveryoneMentionNodeMatcher, EveryoneMentionNodeMatcher,
HereMentionNodeMatcher, HereMentionNodeMatcher,
@ -439,24 +460,46 @@ internal static partial class MarkdownParser
); );
private static IReadOnlyList<MarkdownNode> Parse( private static IReadOnlyList<MarkdownNode> Parse(
MarkdownContext context,
StringSegment segment, StringSegment segment,
IMatcher<MarkdownNode> matcher IMatcher<MarkdownContext, MarkdownNode> matcher
) => matcher.MatchAll(segment, s => new TextNode(s.ToString())).Select(r => r.Value).ToArray(); )
{
// Limit recursion depth to a reasonable number to prevent
// stack overflow on messages with inadvertently deep nesting.
// Example: ********************************* (repeat ad nauseam)
// https://github.com/Tyrrrz/DiscordChatExporter/issues/1214
if (context.Depth >= 32)
return [new TextNode(segment.ToString())];
return matcher
.MatchAll(
new MarkdownContext(context.Depth + 1),
segment,
(_, s) => new TextNode(s.ToString())
)
.Select(r => r.Value)
.ToArray();
}
} }
internal static partial class MarkdownParser internal static partial class MarkdownParser
{ {
private static IReadOnlyList<MarkdownNode> Parse(StringSegment segment) => private static IReadOnlyList<MarkdownNode> Parse(
Parse(segment, NodeMatcher); MarkdownContext context,
StringSegment segment
) => Parse(context, segment, NodeMatcher);
public static IReadOnlyList<MarkdownNode> Parse(string markdown) => public static IReadOnlyList<MarkdownNode> Parse(string markdown) =>
Parse(new StringSegment(markdown)); Parse(new MarkdownContext(), new StringSegment(markdown));
private static IReadOnlyList<MarkdownNode> ParseMinimal(StringSegment segment) => private static IReadOnlyList<MarkdownNode> ParseMinimal(
Parse(segment, MinimalNodeMatcher); MarkdownContext context,
StringSegment segment
) => Parse(context, segment, MinimalNodeMatcher);
public static IReadOnlyList<MarkdownNode> ParseMinimal(string markdown) => public static IReadOnlyList<MarkdownNode> ParseMinimal(string markdown) =>
ParseMinimal(new StringSegment(markdown)); ParseMinimal(new MarkdownContext(), new StringSegment(markdown));
private static void ExtractLinks(IEnumerable<MarkdownNode> nodes, ICollection<LinkNode> links) private static void ExtractLinks(IEnumerable<MarkdownNode> nodes, ICollection<LinkNode> links)
{ {

View file

@ -3,9 +3,12 @@ using System.Text.RegularExpressions;
namespace DiscordChatExporter.Core.Markdown.Parsing; namespace DiscordChatExporter.Core.Markdown.Parsing;
internal class RegexMatcher<T>(Regex regex, Func<StringSegment, Match, T?> transform) : IMatcher<T> internal class RegexMatcher<TContext, TValue>(
Regex regex,
Func<TContext, StringSegment, Match, TValue?> transform
) : IMatcher<TContext, TValue>
{ {
public ParsedMatch<T>? TryMatch(StringSegment segment) public ParsedMatch<TValue>? TryMatch(TContext context, StringSegment segment)
{ {
var match = regex.Match(segment.Source, segment.StartIndex, segment.Length); var match = regex.Match(segment.Source, segment.StartIndex, segment.Length);
if (!match.Success) if (!match.Success)
@ -20,8 +23,8 @@ internal class RegexMatcher<T>(Regex regex, Func<StringSegment, Match, T?> trans
return null; return null;
var segmentMatch = segment.Relocate(match); var segmentMatch = segment.Relocate(match);
var value = transform(segmentMatch, match); var value = transform(context, segmentMatch, match);
return value is not null ? new ParsedMatch<T>(segmentMatch, value) : null; return value is not null ? new ParsedMatch<TValue>(segmentMatch, value) : null;
} }
} }

View file

@ -2,16 +2,16 @@
namespace DiscordChatExporter.Core.Markdown.Parsing; namespace DiscordChatExporter.Core.Markdown.Parsing;
internal class StringMatcher<T>( internal class StringMatcher<TContext, TValue>(
string needle, string needle,
StringComparison comparison, StringComparison comparison,
Func<StringSegment, T?> transform Func<TContext, StringSegment, TValue?> transform
) : IMatcher<T> ) : IMatcher<TContext, TValue>
{ {
public StringMatcher(string needle, Func<StringSegment, T> transform) public StringMatcher(string needle, Func<TContext, StringSegment, TValue> transform)
: this(needle, StringComparison.Ordinal, transform) { } : this(needle, StringComparison.Ordinal, transform) { }
public ParsedMatch<T>? TryMatch(StringSegment segment) public ParsedMatch<TValue>? TryMatch(TContext context, StringSegment segment)
{ {
var index = segment.Source.IndexOf(needle, segment.StartIndex, segment.Length, comparison); var index = segment.Source.IndexOf(needle, segment.StartIndex, segment.Length, comparison);
@ -19,8 +19,8 @@ internal class StringMatcher<T>(
return null; return null;
var segmentMatch = segment.Relocate(index, needle.Length); var segmentMatch = segment.Relocate(index, needle.Length);
var value = transform(segmentMatch); var value = transform(context, segmentMatch);
return value is not null ? new ParsedMatch<T>(segmentMatch, value) : null; return value is not null ? new ParsedMatch<TValue>(segmentMatch, value) : null;
} }
} }