Limit recursion depth in markdown parser to prevent stack overflow (#1273)
Some checks failed
docker / pack (push) Has been cancelled
docker / deploy (push) Has been cancelled
main / format (push) Has been cancelled
main / test (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-arm) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-musl-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, osx-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, osx-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-x86) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, linux-arm) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, linux-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, linux-musl-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, linux-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, osx-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, osx-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, win-arm64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, win-x64) (push) Has been cancelled
main / pack (DiscordChatExporter.Gui, DiscordChatExporter, win-x86) (push) Has been cancelled
main / release (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-arm) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-musl-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, linux-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, osx-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, osx-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Cli, DiscordChatExporter.Cli, win-x86) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, linux-arm) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, linux-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, linux-musl-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, linux-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, osx-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, osx-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, win-arm64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, win-x64) (push) Has been cancelled
main / deploy (DiscordChatExporter.Gui, DiscordChatExporter, win-x86) (push) Has been cancelled
main / notify (push) Has been cancelled

This commit is contained in:
Oleksii Holub 2024-08-14 23:52:03 +03:00 committed by GitHub
parent 10adba3a4d
commit 522789e01d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 288 additions and 232 deletions

View file

@ -2,20 +2,22 @@
namespace DiscordChatExporter.Core.Markdown.Parsing;
internal class AggregateMatcher<T>(IReadOnlyList<IMatcher<T>> matchers) : IMatcher<T>
internal class AggregateMatcher<TContext, TValue>(
IReadOnlyList<IMatcher<TContext, TValue>> matchers
) : IMatcher<TContext, TValue>
{
public AggregateMatcher(params IMatcher<T>[] matchers)
: this((IReadOnlyList<IMatcher<T>>)matchers) { }
public AggregateMatcher(params IMatcher<TContext, TValue>[] matchers)
: this((IReadOnlyList<IMatcher<TContext, TValue>>)matchers) { }
public ParsedMatch<T>? TryMatch(StringSegment segment)
public ParsedMatch<TValue>? TryMatch(TContext context, StringSegment segment)
{
ParsedMatch<T>? earliestMatch = null;
ParsedMatch<TValue>? earliestMatch = null;
// Try to match the input with each matcher and get the match with the lowest start index
foreach (var matcher in matchers)
{
// Try to match
var match = matcher.TryMatch(segment);
var match = matcher.TryMatch(context, segment);
// If there's no match - continue
if (match is null)

View file

@ -3,17 +3,18 @@ using System.Collections.Generic;
namespace DiscordChatExporter.Core.Markdown.Parsing;
// Contract for a component that searches a string segment for one kind of token.
// This span is a rendered diff: the single-parameter declaration lines are the pre-change
// form, and the TContext/TValue lines are the post-change form.
// TContext is declared contravariant (`in`) and is only consumed as an input by TryMatch.
internal interface IMatcher<T>
internal interface IMatcher<in TContext, TValue>
{
// Attempts to find a match inside the given segment.
// The return type is nullable; implementations in this change return null when nothing matches.
ParsedMatch<T>? TryMatch(StringSegment segment);
ParsedMatch<TValue>? TryMatch(TContext context, StringSegment segment);
}
internal static class MatcherExtensions
{
public static IEnumerable<ParsedMatch<T>> MatchAll<T>(
this IMatcher<T> matcher,
public static IEnumerable<ParsedMatch<TValue>> MatchAll<TContext, TValue>(
this IMatcher<TContext, TValue> matcher,
TContext context,
StringSegment segment,
Func<StringSegment, T> transformFallback
Func<TContext, StringSegment, TValue> transformFallback
)
{
// Loop through segments divided by individual matches
@ -22,6 +23,7 @@ internal static class MatcherExtensions
{
// Find a match within this segment
var match = matcher.TryMatch(
context,
segment.Relocate(currentIndex, segment.EndIndex - currentIndex)
);
@ -36,9 +38,9 @@ internal static class MatcherExtensions
match.Segment.StartIndex - currentIndex
);
yield return new ParsedMatch<T>(
yield return new ParsedMatch<TValue>(
fallbackSegment,
transformFallback(fallbackSegment)
transformFallback(context, fallbackSegment)
);
}
@ -53,7 +55,10 @@ internal static class MatcherExtensions
{
var fallbackSegment = segment.Relocate(currentIndex, segment.EndIndex - currentIndex);
yield return new ParsedMatch<T>(fallbackSegment, transformFallback(fallbackSegment));
yield return new ParsedMatch<TValue>(
fallbackSegment,
transformFallback(context, fallbackSegment)
);
}
}
}

View file

@ -0,0 +1,3 @@
namespace DiscordChatExporter.Core.Markdown.Parsing;
// Immutable state passed through the markdown matchers while parsing.
// Depth defaults to 0; MarkdownParser.Parse passes a new context with Depth + 1 to the
// matchers on each call and stops parsing nested content once Depth reaches its limit.
internal readonly record struct MarkdownContext(int Depth = 0);

View file

@ -23,15 +23,15 @@ internal static partial class MarkdownParser
/* Formatting */
private static readonly IMatcher<MarkdownNode> BoldFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> BoldFormattingNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// There must be exactly two closing asterisks.
new Regex(@"\*\*(.+?)\*\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => new FormattingNode(FormattingKind.Bold, Parse(s.Relocate(m.Groups[1])))
(c, s, m) => new FormattingNode(FormattingKind.Bold, Parse(c, s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownNode> ItalicFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> ItalicFormattingNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// There must be exactly one closing asterisk.
// Opening asterisk must not be followed by whitespace.
// Closing asterisk must not be preceded by whitespace.
@ -39,156 +39,174 @@ internal static partial class MarkdownParser
@"\*(?!\s)(.+?)(?<!\s|\*)\*(?!\*)",
DefaultRegexOptions | RegexOptions.Singleline
),
(s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1])))
(c, s, m) =>
new FormattingNode(FormattingKind.Italic, Parse(c, s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownNode> ItalicBoldFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>(
// There must be exactly three closing asterisks.
new Regex(@"\*(\*\*.+?\*\*)\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) =>
new FormattingNode(
FormattingKind.Italic,
Parse(s.Relocate(m.Groups[1]), BoldFormattingNodeMatcher)
)
);
private static readonly IMatcher<MarkdownNode> ItalicAltFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>(
// Closing underscore must not be followed by a word character.
new Regex(@"_(.+?)_(?!\w)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownNode> UnderlineFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>(
// There must be exactly two closing underscores.
new Regex(@"__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => new FormattingNode(FormattingKind.Underline, Parse(s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownNode> ItalicUnderlineFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>(
// There must be exactly three closing underscores.
new Regex(@"_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) =>
new FormattingNode(
FormattingKind.Italic,
Parse(s.Relocate(m.Groups[1]), UnderlineFormattingNodeMatcher)
)
);
private static readonly IMatcher<MarkdownNode> StrikethroughFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>(
new Regex(@"~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) =>
new FormattingNode(FormattingKind.Strikethrough, Parse(s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownNode> SpoilerFormattingNodeMatcher =
new RegexMatcher<MarkdownNode>(
new Regex(@"\|\|(.+?)\|\|", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => new FormattingNode(FormattingKind.Spoiler, Parse(s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownNode> SingleLineQuoteNodeMatcher =
new RegexMatcher<MarkdownNode>(
// Include the linebreak in the content so that the lines are preserved in quotes.
new Regex(@"^>\s(.+\n?)", DefaultRegexOptions),
(s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownNode> RepeatedSingleLineQuoteNodeMatcher =
new RegexMatcher<MarkdownNode>(
// Include the linebreaks in the content, so that the lines are preserved in quotes.
// Empty content is allowed within quotes.
// https://github.com/Tyrrrz/DiscordChatExporter/issues/1115
new Regex(@"(?:^>\s(.*\n?)){2,}", DefaultRegexOptions),
(s, m) =>
new FormattingNode(
FormattingKind.Quote,
m.Groups[1].Captures.SelectMany(c => Parse(s.Relocate(c))).ToArray()
)
);
private static readonly IMatcher<MarkdownNode> MultiLineQuoteNodeMatcher =
new RegexMatcher<MarkdownNode>(
new Regex(@"^>>>\s(.+)", DefaultRegexOptions | RegexOptions.Singleline),
(s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownNode> HeadingNodeMatcher =
new RegexMatcher<MarkdownNode>(
// Consume the linebreak so that it's not attached to following nodes.
new Regex(@"^(\#{1,3})\s(.+)\n", DefaultRegexOptions),
(s, m) => new HeadingNode(m.Groups[1].Length, Parse(s.Relocate(m.Groups[2])))
);
private static readonly IMatcher<MarkdownNode> ListNodeMatcher = new RegexMatcher<MarkdownNode>(
// Can be preceded by whitespace, which specifies the list's nesting level.
// Following lines that start with (level+1) whitespace are considered part of the list item.
// Consume the linebreak so that it's not attached to following nodes.
new Regex(@"^(\s*)(?:[\-\*]\s(.+(?:\n\s\1.*)*)?\n?)+", DefaultRegexOptions),
(s, m) =>
new ListNode(
m.Groups[2].Captures.Select(c => new ListItemNode(Parse(s.Relocate(c)))).ToArray()
private static readonly IMatcher<
MarkdownContext,
MarkdownNode
> ItalicBoldFormattingNodeMatcher = new RegexMatcher<MarkdownContext, MarkdownNode>(
// There must be exactly three closing asterisks.
new Regex(@"\*(\*\*.+?\*\*)\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline),
(c, s, m) =>
new FormattingNode(
FormattingKind.Italic,
Parse(c, s.Relocate(m.Groups[1]), BoldFormattingNodeMatcher)
)
);
/* Code blocks */
private static readonly IMatcher<MarkdownNode> InlineCodeBlockNodeMatcher =
new RegexMatcher<MarkdownNode>(
// One or two backticks are allowed, but they must match on both sides.
new Regex(@"(`{1,2})([^`]+)\1", DefaultRegexOptions | RegexOptions.Singleline),
(_, m) => new InlineCodeBlockNode(m.Groups[2].Value)
private static readonly IMatcher<MarkdownContext, MarkdownNode> ItalicAltFormattingNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Closing underscore must not be followed by a word character.
new Regex(@"_(.+?)_(?!\w)", DefaultRegexOptions | RegexOptions.Singleline),
(c, s, m) =>
new FormattingNode(FormattingKind.Italic, Parse(c, s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownNode> MultiLineCodeBlockNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> UnderlineFormattingNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// There must be exactly two closing underscores.
new Regex(@"__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
(c, s, m) =>
new FormattingNode(FormattingKind.Underline, Parse(c, s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<
MarkdownContext,
MarkdownNode
> ItalicUnderlineFormattingNodeMatcher = new RegexMatcher<MarkdownContext, MarkdownNode>(
// There must be exactly three closing underscores.
new Regex(@"_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
(c, s, m) =>
new FormattingNode(
FormattingKind.Italic,
Parse(c, s.Relocate(m.Groups[1]), UnderlineFormattingNodeMatcher)
)
);
private static readonly IMatcher<
MarkdownContext,
MarkdownNode
> StrikethroughFormattingNodeMatcher = new RegexMatcher<MarkdownContext, MarkdownNode>(
new Regex(@"~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
(c, s, m) =>
new FormattingNode(FormattingKind.Strikethrough, Parse(c, s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownContext, MarkdownNode> SpoilerFormattingNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
new Regex(@"\|\|(.+?)\|\|", DefaultRegexOptions | RegexOptions.Singleline),
(c, s, m) =>
new FormattingNode(FormattingKind.Spoiler, Parse(c, s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownContext, MarkdownNode> SingleLineQuoteNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Include the linebreak in the content so that the lines are preserved in quotes.
new Regex(@"^>\s(.+\n?)", DefaultRegexOptions),
(c, s, m) => new FormattingNode(FormattingKind.Quote, Parse(c, s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<
MarkdownContext,
MarkdownNode
> RepeatedSingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownContext, MarkdownNode>(
// Include the linebreaks in the content, so that the lines are preserved in quotes.
// Empty content is allowed within quotes.
// https://github.com/Tyrrrz/DiscordChatExporter/issues/1115
new Regex(@"(?:^>\s(.*\n?)){2,}", DefaultRegexOptions),
(c, s, m) =>
new FormattingNode(
FormattingKind.Quote,
m.Groups[1].Captures.SelectMany(r => Parse(c, s.Relocate(r))).ToArray()
)
);
private static readonly IMatcher<MarkdownContext, MarkdownNode> MultiLineQuoteNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
new Regex(@"^>>>\s(.+)", DefaultRegexOptions | RegexOptions.Singleline),
(c, s, m) => new FormattingNode(FormattingKind.Quote, Parse(c, s.Relocate(m.Groups[1])))
);
private static readonly IMatcher<MarkdownContext, MarkdownNode> HeadingNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Consume the linebreak so that it's not attached to following nodes.
new Regex(@"^(\#{1,3})\s(.+)\n", DefaultRegexOptions),
(c, s, m) => new HeadingNode(m.Groups[1].Length, Parse(c, s.Relocate(m.Groups[2])))
);
private static readonly IMatcher<MarkdownContext, MarkdownNode> ListNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Can be preceded by whitespace, which specifies the list's nesting level.
// Following lines that start with (level+1) whitespace are considered part of the list item.
// Consume the linebreak so that it's not attached to following nodes.
new Regex(@"^(\s*)(?:[\-\*]\s(.+(?:\n\s\1.*)*)?\n?)+", DefaultRegexOptions),
(c, s, m) =>
new ListNode(
m.Groups[2]
.Captures.Select(x => new ListItemNode(Parse(c, s.Relocate(x))))
.ToArray()
)
);
/* Code blocks */
private static readonly IMatcher<MarkdownContext, MarkdownNode> InlineCodeBlockNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// One or two backticks are allowed, but they must match on both sides.
new Regex(@"(`{1,2})([^`]+)\1", DefaultRegexOptions | RegexOptions.Singleline),
(_, _, m) => new InlineCodeBlockNode(m.Groups[2].Value)
);
private static readonly IMatcher<MarkdownContext, MarkdownNode> MultiLineCodeBlockNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Language identifier is one word immediately after opening backticks, followed immediately by a linebreak.
// Blank lines at the beginning and at the end of content are trimmed.
new Regex(@"```(?:(\w*)\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline),
(_, m) =>
(_, _, m) =>
new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n'))
);
/* Mentions */
private static readonly IMatcher<MarkdownNode> EveryoneMentionNodeMatcher =
new StringMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> EveryoneMentionNodeMatcher =
new StringMatcher<MarkdownContext, MarkdownNode>(
"@everyone",
_ => new MentionNode(null, MentionKind.Everyone)
(_, _) => new MentionNode(null, MentionKind.Everyone)
);
private static readonly IMatcher<MarkdownNode> HereMentionNodeMatcher =
new StringMatcher<MarkdownNode>("@here", _ => new MentionNode(null, MentionKind.Here));
private static readonly IMatcher<MarkdownContext, MarkdownNode> HereMentionNodeMatcher =
new StringMatcher<MarkdownContext, MarkdownNode>(
"@here",
(_, _) => new MentionNode(null, MentionKind.Here)
);
private static readonly IMatcher<MarkdownNode> UserMentionNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> UserMentionNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture <@123456> or <@!123456>
new Regex(@"<@!?(\d+)>", DefaultRegexOptions),
(_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.User)
(_, _, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.User)
);
private static readonly IMatcher<MarkdownNode> ChannelMentionNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> ChannelMentionNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture <#123456>
new Regex(@"<\#!?(\d+)>", DefaultRegexOptions),
(_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Channel)
(_, _, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Channel)
);
private static readonly IMatcher<MarkdownNode> RoleMentionNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> RoleMentionNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture <@&123456>
new Regex(@"<@&(\d+)>", DefaultRegexOptions),
(_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Role)
(_, _, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Role)
);
/* Emoji */
private static readonly IMatcher<MarkdownNode> StandardEmojiNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> StandardEmojiNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
new Regex(
@"("
+
@ -239,21 +257,21 @@ internal static partial class MarkdownParser
+ @")",
DefaultRegexOptions
),
(_, m) => new EmojiNode(m.Groups[1].Value)
(_, _, m) => new EmojiNode(m.Groups[1].Value)
);
private static readonly IMatcher<MarkdownNode> CodedStandardEmojiNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> CodedStandardEmojiNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture :thinking:
new Regex(@":([\w_]+):", DefaultRegexOptions),
(_, m) => EmojiIndex.TryGetName(m.Groups[1].Value)?.Pipe(n => new EmojiNode(n))
(_, _, m) => EmojiIndex.TryGetName(m.Groups[1].Value)?.Pipe(n => new EmojiNode(n))
);
private static readonly IMatcher<MarkdownNode> CustomEmojiNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> CustomEmojiNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture <:lul:123456> or <a:lul:123456>
new Regex(@"<(a)?:(.+?):(\d+?)>", DefaultRegexOptions),
(_, m) =>
(_, _, m) =>
new EmojiNode(
Snowflake.TryParse(m.Groups[3].Value),
m.Groups[2].Value,
@ -263,70 +281,72 @@ internal static partial class MarkdownParser
/* Links */
private static readonly IMatcher<MarkdownNode> AutoLinkNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> AutoLinkNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Any non-whitespace character after http:// or https://
// until the last punctuation character or whitespace.
new Regex(@"(https?://\S*[^\.,:;""'\s])", DefaultRegexOptions),
(_, m) => new LinkNode(m.Groups[1].Value)
(_, _, m) => new LinkNode(m.Groups[1].Value)
);
private static readonly IMatcher<MarkdownNode> HiddenLinkNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> HiddenLinkNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Same as auto link but also surrounded by angular brackets
new Regex(@"<(https?://\S*[^\.,:;""'\s])>", DefaultRegexOptions),
(_, m) => new LinkNode(m.Groups[1].Value)
(_, _, m) => new LinkNode(m.Groups[1].Value)
);
private static readonly IMatcher<MarkdownNode> MaskedLinkNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> MaskedLinkNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture [title](link)
new Regex(@"\[(.+?)\]\((.+?)\)", DefaultRegexOptions),
(s, m) => new LinkNode(m.Groups[2].Value, Parse(s.Relocate(m.Groups[1])))
(c, s, m) => new LinkNode(m.Groups[2].Value, Parse(c, s.Relocate(m.Groups[1])))
);
/* Text */
private static readonly IMatcher<MarkdownNode> ShrugTextNodeMatcher =
new StringMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> ShrugTextNodeMatcher =
new StringMatcher<MarkdownContext, MarkdownNode>(
// Capture the shrug kaomoji.
// This escapes it from matching for formatting.
@"¯\_(ツ)_/¯",
s => new TextNode(s.ToString())
(s, _) => new TextNode(s.ToString())
);
private static readonly IMatcher<MarkdownNode> IgnoredEmojiTextNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> IgnoredEmojiTextNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture some specific emoji that don't get rendered.
// This escapes them from matching for emoji.
new Regex(@"([\u26A7\u2640\u2642\u2695\u267E\u00A9\u00AE\u2122])", DefaultRegexOptions),
(_, m) => new TextNode(m.Groups[1].Value)
(_, _, m) => new TextNode(m.Groups[1].Value)
);
private static readonly IMatcher<MarkdownNode> EscapedSymbolTextNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> EscapedSymbolTextNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture any "symbol/other" character or surrogate pair preceded by a backslash.
// This escapes them from matching for emoji.
// https://github.com/Tyrrrz/DiscordChatExporter/issues/230
new Regex(@"\\(\p{So}|\p{Cs}{2})", DefaultRegexOptions),
(_, m) => new TextNode(m.Groups[1].Value)
(_, _, m) => new TextNode(m.Groups[1].Value)
);
private static readonly IMatcher<MarkdownNode> EscapedCharacterTextNodeMatcher =
new RegexMatcher<MarkdownNode>(
// Capture any non-whitespace, non latin alphanumeric character preceded by a backslash.
// This escapes them from matching for formatting or other tokens.
new Regex(@"\\([^a-zA-Z0-9\s])", DefaultRegexOptions),
(_, m) => new TextNode(m.Groups[1].Value)
);
private static readonly IMatcher<
MarkdownContext,
MarkdownNode
> EscapedCharacterTextNodeMatcher = new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture any non-whitespace, non latin alphanumeric character preceded by a backslash.
// This escapes them from matching for formatting or other tokens.
new Regex(@"\\([^a-zA-Z0-9\s])", DefaultRegexOptions),
(_, _, m) => new TextNode(m.Groups[1].Value)
);
/* Misc */
private static readonly IMatcher<MarkdownNode> TimestampNodeMatcher =
new RegexMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> TimestampNodeMatcher =
new RegexMatcher<MarkdownContext, MarkdownNode>(
// Capture <t:12345678> or <t:12345678:R>
new Regex(@"<t:(-?\d+)(?::(\w))?>", DefaultRegexOptions),
(_, m) =>
(_, _, m) =>
{
try
{
@ -382,50 +402,51 @@ internal static partial class MarkdownParser
);
// Matchers that have similar patterns are ordered from most specific to least specific
private static readonly IMatcher<MarkdownNode> NodeMatcher = new AggregateMatcher<MarkdownNode>(
// Escaped text
ShrugTextNodeMatcher,
IgnoredEmojiTextNodeMatcher,
EscapedSymbolTextNodeMatcher,
EscapedCharacterTextNodeMatcher,
// Formatting
ItalicBoldFormattingNodeMatcher,
ItalicUnderlineFormattingNodeMatcher,
BoldFormattingNodeMatcher,
ItalicFormattingNodeMatcher,
UnderlineFormattingNodeMatcher,
ItalicAltFormattingNodeMatcher,
StrikethroughFormattingNodeMatcher,
SpoilerFormattingNodeMatcher,
MultiLineQuoteNodeMatcher,
RepeatedSingleLineQuoteNodeMatcher,
SingleLineQuoteNodeMatcher,
HeadingNodeMatcher,
ListNodeMatcher,
// Code blocks
MultiLineCodeBlockNodeMatcher,
InlineCodeBlockNodeMatcher,
// Mentions
EveryoneMentionNodeMatcher,
HereMentionNodeMatcher,
UserMentionNodeMatcher,
ChannelMentionNodeMatcher,
RoleMentionNodeMatcher,
// Links
MaskedLinkNodeMatcher,
AutoLinkNodeMatcher,
HiddenLinkNodeMatcher,
// Emoji
StandardEmojiNodeMatcher,
CustomEmojiNodeMatcher,
CodedStandardEmojiNodeMatcher,
// Misc
TimestampNodeMatcher
);
private static readonly IMatcher<MarkdownContext, MarkdownNode> NodeMatcher =
new AggregateMatcher<MarkdownContext, MarkdownNode>(
// Escaped text
ShrugTextNodeMatcher,
IgnoredEmojiTextNodeMatcher,
EscapedSymbolTextNodeMatcher,
EscapedCharacterTextNodeMatcher,
// Formatting
ItalicBoldFormattingNodeMatcher,
ItalicUnderlineFormattingNodeMatcher,
BoldFormattingNodeMatcher,
ItalicFormattingNodeMatcher,
UnderlineFormattingNodeMatcher,
ItalicAltFormattingNodeMatcher,
StrikethroughFormattingNodeMatcher,
SpoilerFormattingNodeMatcher,
MultiLineQuoteNodeMatcher,
RepeatedSingleLineQuoteNodeMatcher,
SingleLineQuoteNodeMatcher,
HeadingNodeMatcher,
ListNodeMatcher,
// Code blocks
MultiLineCodeBlockNodeMatcher,
InlineCodeBlockNodeMatcher,
// Mentions
EveryoneMentionNodeMatcher,
HereMentionNodeMatcher,
UserMentionNodeMatcher,
ChannelMentionNodeMatcher,
RoleMentionNodeMatcher,
// Links
MaskedLinkNodeMatcher,
AutoLinkNodeMatcher,
HiddenLinkNodeMatcher,
// Emoji
StandardEmojiNodeMatcher,
CustomEmojiNodeMatcher,
CodedStandardEmojiNodeMatcher,
// Misc
TimestampNodeMatcher
);
// Minimal set of matchers for non-multimedia formats (e.g. plain text)
private static readonly IMatcher<MarkdownNode> MinimalNodeMatcher =
new AggregateMatcher<MarkdownNode>(
private static readonly IMatcher<MarkdownContext, MarkdownNode> MinimalNodeMatcher =
new AggregateMatcher<MarkdownContext, MarkdownNode>(
// Mentions
EveryoneMentionNodeMatcher,
HereMentionNodeMatcher,
@ -439,24 +460,46 @@ internal static partial class MarkdownParser
);
// Runs the given matcher over the segment and returns the produced nodes in order.
// Stretches of text that no matcher claims are emitted as TextNode via the fallback lambda.
// This span is a rendered diff: the expression-bodied form taking IMatcher<MarkdownNode>
// is the pre-change version, and the block body with the depth guard is the post-change version.
private static IReadOnlyList<MarkdownNode> Parse(
MarkdownContext context,
StringSegment segment,
IMatcher<MarkdownNode> matcher
) => matcher.MatchAll(segment, s => new TextNode(s.ToString())).Select(r => r.Value).ToArray();
IMatcher<MarkdownContext, MarkdownNode> matcher
)
{
// Limit recursion depth to a reasonable number to prevent
// stack overflow on messages with inadvertently deep nesting.
// Example: ********************************* (repeat ad nauseam)
// https://github.com/Tyrrrz/DiscordChatExporter/issues/1214
// Once the limit is hit, the whole remaining segment is returned as a single
// plain-text node instead of being parsed any further.
if (context.Depth >= 32)
return [new TextNode(segment.ToString())];
// The matchers receive a context one level deeper; their transforms pass it back into
// Parse for nested content, which is how Depth grows with nesting.
return matcher
.MatchAll(
new MarkdownContext(context.Depth + 1),
segment,
(_, s) => new TextNode(s.ToString())
)
.Select(r => r.Value)
.ToArray();
}
}
internal static partial class MarkdownParser
{
private static IReadOnlyList<MarkdownNode> Parse(StringSegment segment) =>
Parse(segment, NodeMatcher);
private static IReadOnlyList<MarkdownNode> Parse(
MarkdownContext context,
StringSegment segment
) => Parse(context, segment, NodeMatcher);
public static IReadOnlyList<MarkdownNode> Parse(string markdown) =>
Parse(new StringSegment(markdown));
Parse(new MarkdownContext(), new StringSegment(markdown));
private static IReadOnlyList<MarkdownNode> ParseMinimal(StringSegment segment) =>
Parse(segment, MinimalNodeMatcher);
private static IReadOnlyList<MarkdownNode> ParseMinimal(
MarkdownContext context,
StringSegment segment
) => Parse(context, segment, MinimalNodeMatcher);
public static IReadOnlyList<MarkdownNode> ParseMinimal(string markdown) =>
ParseMinimal(new StringSegment(markdown));
ParseMinimal(new MarkdownContext(), new StringSegment(markdown));
private static void ExtractLinks(IEnumerable<MarkdownNode> nodes, ICollection<LinkNode> links)
{

View file

@ -3,9 +3,12 @@ using System.Text.RegularExpressions;
namespace DiscordChatExporter.Core.Markdown.Parsing;
internal class RegexMatcher<T>(Regex regex, Func<StringSegment, Match, T?> transform) : IMatcher<T>
internal class RegexMatcher<TContext, TValue>(
Regex regex,
Func<TContext, StringSegment, Match, TValue?> transform
) : IMatcher<TContext, TValue>
{
public ParsedMatch<T>? TryMatch(StringSegment segment)
public ParsedMatch<TValue>? TryMatch(TContext context, StringSegment segment)
{
var match = regex.Match(segment.Source, segment.StartIndex, segment.Length);
if (!match.Success)
@ -20,8 +23,8 @@ internal class RegexMatcher<T>(Regex regex, Func<StringSegment, Match, T?> trans
return null;
var segmentMatch = segment.Relocate(match);
var value = transform(segmentMatch, match);
var value = transform(context, segmentMatch, match);
return value is not null ? new ParsedMatch<T>(segmentMatch, value) : null;
return value is not null ? new ParsedMatch<TValue>(segmentMatch, value) : null;
}
}

View file

@ -2,16 +2,16 @@
namespace DiscordChatExporter.Core.Markdown.Parsing;
internal class StringMatcher<T>(
internal class StringMatcher<TContext, TValue>(
string needle,
StringComparison comparison,
Func<StringSegment, T?> transform
) : IMatcher<T>
Func<TContext, StringSegment, TValue?> transform
) : IMatcher<TContext, TValue>
{
public StringMatcher(string needle, Func<StringSegment, T> transform)
public StringMatcher(string needle, Func<TContext, StringSegment, TValue> transform)
: this(needle, StringComparison.Ordinal, transform) { }
public ParsedMatch<T>? TryMatch(StringSegment segment)
public ParsedMatch<TValue>? TryMatch(TContext context, StringSegment segment)
{
var index = segment.Source.IndexOf(needle, segment.StartIndex, segment.Length, comparison);
@ -19,8 +19,8 @@ internal class StringMatcher<T>(
return null;
var segmentMatch = segment.Relocate(index, needle.Length);
var value = transform(segmentMatch);
var value = transform(context, segmentMatch);
return value is not null ? new ParsedMatch<T>(segmentMatch, value) : null;
return value is not null ? new ParsedMatch<TValue>(segmentMatch, value) : null;
}
}