diff --git a/DiscordChatExporter.Core/Markdown/Parsing/AggregateMatcher.cs b/DiscordChatExporter.Core/Markdown/Parsing/AggregateMatcher.cs index 6a3e6e7c..aa6597bc 100644 --- a/DiscordChatExporter.Core/Markdown/Parsing/AggregateMatcher.cs +++ b/DiscordChatExporter.Core/Markdown/Parsing/AggregateMatcher.cs @@ -2,20 +2,22 @@ namespace DiscordChatExporter.Core.Markdown.Parsing; -internal class AggregateMatcher(IReadOnlyList> matchers) : IMatcher +internal class AggregateMatcher( + IReadOnlyList> matchers +) : IMatcher { - public AggregateMatcher(params IMatcher[] matchers) - : this((IReadOnlyList>)matchers) { } + public AggregateMatcher(params IMatcher[] matchers) + : this((IReadOnlyList>)matchers) { } - public ParsedMatch? TryMatch(StringSegment segment) + public ParsedMatch? TryMatch(TContext context, StringSegment segment) { - ParsedMatch? earliestMatch = null; + ParsedMatch? earliestMatch = null; // Try to match the input with each matcher and get the match with the lowest start index foreach (var matcher in matchers) { // Try to match - var match = matcher.TryMatch(segment); + var match = matcher.TryMatch(context, segment); // If there's no match - continue if (match is null) diff --git a/DiscordChatExporter.Core/Markdown/Parsing/IMatcher.cs b/DiscordChatExporter.Core/Markdown/Parsing/IMatcher.cs index 4c71c0a0..d1da5feb 100644 --- a/DiscordChatExporter.Core/Markdown/Parsing/IMatcher.cs +++ b/DiscordChatExporter.Core/Markdown/Parsing/IMatcher.cs @@ -3,17 +3,18 @@ using System.Collections.Generic; namespace DiscordChatExporter.Core.Markdown.Parsing; -internal interface IMatcher +internal interface IMatcher { - ParsedMatch? TryMatch(StringSegment segment); + ParsedMatch? TryMatch(TContext context, StringSegment segment); } internal static class MatcherExtensions { - public static IEnumerable> MatchAll( - this IMatcher matcher, + public static IEnumerable> MatchAll( + this IMatcher matcher, + TContext context, StringSegment segment, - Func transformFallback + Func transformFallback ) { // Loop through segments divided by individual matches @@ -22,6 +23,7 @@ internal static class MatcherExtensions { // Find a match within this segment var match = matcher.TryMatch( + context, segment.Relocate(currentIndex, segment.EndIndex - currentIndex) ); @@ -36,9 +38,9 @@ internal static class MatcherExtensions match.Segment.StartIndex - currentIndex ); - yield return new ParsedMatch( + yield return new ParsedMatch( fallbackSegment, - transformFallback(fallbackSegment) + transformFallback(context, fallbackSegment) ); } @@ -53,7 +55,10 @@ internal static class MatcherExtensions { var fallbackSegment = segment.Relocate(currentIndex, segment.EndIndex - currentIndex); - yield return new ParsedMatch(fallbackSegment, transformFallback(fallbackSegment)); + yield return new ParsedMatch( + fallbackSegment, + transformFallback(context, fallbackSegment) + ); } } } diff --git a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownContext.cs b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownContext.cs new file mode 100644 index 00000000..6e213411 --- /dev/null +++ b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownContext.cs @@ -0,0 +1,3 @@ +namespace DiscordChatExporter.Core.Markdown.Parsing; + +internal readonly record struct MarkdownContext(int Depth = 0); diff --git a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs index 28b813af..87399f92 100644 --- a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs +++ b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs @@ -23,15 +23,15 @@ internal static partial class MarkdownParser /* Formatting */ - private static readonly IMatcher BoldFormattingNodeMatcher = - new RegexMatcher( + private static readonly IMatcher BoldFormattingNodeMatcher = + new RegexMatcher( // There must be exactly two closing asterisks. new Regex(@"\*\*(.+?)\*\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline), - (s, m) => new FormattingNode(FormattingKind.Bold, Parse(s.Relocate(m.Groups[1]))) + (c, s, m) => new FormattingNode(FormattingKind.Bold, Parse(c, s.Relocate(m.Groups[1]))) ); - private static readonly IMatcher ItalicFormattingNodeMatcher = - new RegexMatcher( + private static readonly IMatcher ItalicFormattingNodeMatcher = + new RegexMatcher( // There must be exactly one closing asterisk. // Opening asterisk must not be followed by whitespace. // Closing asterisk must not be preceded by whitespace. @@ -39,156 +39,174 @@ internal static partial class MarkdownParser @"\*(?!\s)(.+?)(? new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1]))) + (c, s, m) => + new FormattingNode(FormattingKind.Italic, Parse(c, s.Relocate(m.Groups[1]))) ); - private static readonly IMatcher ItalicBoldFormattingNodeMatcher = - new RegexMatcher( - // There must be exactly three closing asterisks. - new Regex(@"\*(\*\*.+?\*\*)\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline), - (s, m) => - new FormattingNode( - FormattingKind.Italic, - Parse(s.Relocate(m.Groups[1]), BoldFormattingNodeMatcher) - ) - ); - - private static readonly IMatcher ItalicAltFormattingNodeMatcher = - new RegexMatcher( - // Closing underscore must not be followed by a word character. - new Regex(@"_(.+?)_(?!\w)", DefaultRegexOptions | RegexOptions.Singleline), - (s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1]))) - ); - - private static readonly IMatcher UnderlineFormattingNodeMatcher = - new RegexMatcher( - // There must be exactly two closing underscores. - new Regex(@"__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline), - (s, m) => new FormattingNode(FormattingKind.Underline, Parse(s.Relocate(m.Groups[1]))) - ); - - private static readonly IMatcher ItalicUnderlineFormattingNodeMatcher = - new RegexMatcher( - // There must be exactly three closing underscores. - new Regex(@"_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline), - (s, m) => - new FormattingNode( - FormattingKind.Italic, - Parse(s.Relocate(m.Groups[1]), UnderlineFormattingNodeMatcher) - ) - ); - - private static readonly IMatcher StrikethroughFormattingNodeMatcher = - new RegexMatcher( - new Regex(@"~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline), - (s, m) => - new FormattingNode(FormattingKind.Strikethrough, Parse(s.Relocate(m.Groups[1]))) - ); - - private static readonly IMatcher SpoilerFormattingNodeMatcher = - new RegexMatcher( - new Regex(@"\|\|(.+?)\|\|", DefaultRegexOptions | RegexOptions.Singleline), - (s, m) => new FormattingNode(FormattingKind.Spoiler, Parse(s.Relocate(m.Groups[1]))) - ); - - private static readonly IMatcher SingleLineQuoteNodeMatcher = - new RegexMatcher( - // Include the linebreak in the content so that the lines are preserved in quotes. - new Regex(@"^>\s(.+\n?)", DefaultRegexOptions), - (s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1]))) - ); - - private static readonly IMatcher RepeatedSingleLineQuoteNodeMatcher = - new RegexMatcher( - // Include the linebreaks in the content, so that the lines are preserved in quotes. - // Empty content is allowed within quotes. - // https://github.com/Tyrrrz/DiscordChatExporter/issues/1115 - new Regex(@"(?:^>\s(.*\n?)){2,}", DefaultRegexOptions), - (s, m) => - new FormattingNode( - FormattingKind.Quote, - m.Groups[1].Captures.SelectMany(c => Parse(s.Relocate(c))).ToArray() - ) - ); - - private static readonly IMatcher MultiLineQuoteNodeMatcher = - new RegexMatcher( - new Regex(@"^>>>\s(.+)", DefaultRegexOptions | RegexOptions.Singleline), - (s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1]))) - ); - - private static readonly IMatcher HeadingNodeMatcher = - new RegexMatcher( - // Consume the linebreak so that it's not attached to following nodes. - new Regex(@"^(\#{1,3})\s(.+)\n", DefaultRegexOptions), - (s, m) => new HeadingNode(m.Groups[1].Length, Parse(s.Relocate(m.Groups[2]))) - ); - - private static readonly IMatcher ListNodeMatcher = new RegexMatcher( - // Can be preceded by whitespace, which specifies the list's nesting level. - // Following lines that start with (level+1) whitespace are considered part of the list item. - // Consume the linebreak so that it's not attached to following nodes. - new Regex(@"^(\s*)(?:[\-\*]\s(.+(?:\n\s\1.*)*)?\n?)+", DefaultRegexOptions), - (s, m) => - new ListNode( - m.Groups[2].Captures.Select(c => new ListItemNode(Parse(s.Relocate(c)))).ToArray() + private static readonly IMatcher< + MarkdownContext, + MarkdownNode + > ItalicBoldFormattingNodeMatcher = new RegexMatcher( + // There must be exactly three closing asterisks. + new Regex(@"\*(\*\*.+?\*\*)\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline), + (c, s, m) => + new FormattingNode( + FormattingKind.Italic, + Parse(c, s.Relocate(m.Groups[1]), BoldFormattingNodeMatcher) ) ); - /* Code blocks */ - - private static readonly IMatcher InlineCodeBlockNodeMatcher = - new RegexMatcher( - // One or two backticks are allowed, but they must match on both sides. - new Regex(@"(`{1,2})([^`]+)\1", DefaultRegexOptions | RegexOptions.Singleline), - (_, m) => new InlineCodeBlockNode(m.Groups[2].Value) + private static readonly IMatcher ItalicAltFormattingNodeMatcher = + new RegexMatcher( + // Closing underscore must not be followed by a word character. + new Regex(@"_(.+?)_(?!\w)", DefaultRegexOptions | RegexOptions.Singleline), + (c, s, m) => + new FormattingNode(FormattingKind.Italic, Parse(c, s.Relocate(m.Groups[1]))) ); - private static readonly IMatcher MultiLineCodeBlockNodeMatcher = - new RegexMatcher( + private static readonly IMatcher UnderlineFormattingNodeMatcher = + new RegexMatcher( + // There must be exactly two closing underscores. + new Regex(@"__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline), + (c, s, m) => + new FormattingNode(FormattingKind.Underline, Parse(c, s.Relocate(m.Groups[1]))) + ); + + private static readonly IMatcher< + MarkdownContext, + MarkdownNode + > ItalicUnderlineFormattingNodeMatcher = new RegexMatcher( + // There must be exactly three closing underscores. + new Regex(@"_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline), + (c, s, m) => + new FormattingNode( + FormattingKind.Italic, + Parse(c, s.Relocate(m.Groups[1]), UnderlineFormattingNodeMatcher) + ) + ); + + private static readonly IMatcher< + MarkdownContext, + MarkdownNode + > StrikethroughFormattingNodeMatcher = new RegexMatcher( + new Regex(@"~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline), + (c, s, m) => + new FormattingNode(FormattingKind.Strikethrough, Parse(c, s.Relocate(m.Groups[1]))) + ); + + private static readonly IMatcher SpoilerFormattingNodeMatcher = + new RegexMatcher( + new Regex(@"\|\|(.+?)\|\|", DefaultRegexOptions | RegexOptions.Singleline), + (c, s, m) => + new FormattingNode(FormattingKind.Spoiler, Parse(c, s.Relocate(m.Groups[1]))) + ); + + private static readonly IMatcher SingleLineQuoteNodeMatcher = + new RegexMatcher( + // Include the linebreak in the content so that the lines are preserved in quotes. + new Regex(@"^>\s(.+\n?)", DefaultRegexOptions), + (c, s, m) => new FormattingNode(FormattingKind.Quote, Parse(c, s.Relocate(m.Groups[1]))) + ); + + private static readonly IMatcher< + MarkdownContext, + MarkdownNode + > RepeatedSingleLineQuoteNodeMatcher = new RegexMatcher( + // Include the linebreaks in the content, so that the lines are preserved in quotes. + // Empty content is allowed within quotes. + // https://github.com/Tyrrrz/DiscordChatExporter/issues/1115 + new Regex(@"(?:^>\s(.*\n?)){2,}", DefaultRegexOptions), + (c, s, m) => + new FormattingNode( + FormattingKind.Quote, + m.Groups[1].Captures.SelectMany(r => Parse(c, s.Relocate(r))).ToArray() + ) + ); + + private static readonly IMatcher MultiLineQuoteNodeMatcher = + new RegexMatcher( + new Regex(@"^>>>\s(.+)", DefaultRegexOptions | RegexOptions.Singleline), + (c, s, m) => new FormattingNode(FormattingKind.Quote, Parse(c, s.Relocate(m.Groups[1]))) + ); + + private static readonly IMatcher HeadingNodeMatcher = + new RegexMatcher( + // Consume the linebreak so that it's not attached to following nodes. + new Regex(@"^(\#{1,3})\s(.+)\n", DefaultRegexOptions), + (c, s, m) => new HeadingNode(m.Groups[1].Length, Parse(c, s.Relocate(m.Groups[2]))) + ); + + private static readonly IMatcher ListNodeMatcher = + new RegexMatcher( + // Can be preceded by whitespace, which specifies the list's nesting level. + // Following lines that start with (level+1) whitespace are considered part of the list item. + // Consume the linebreak so that it's not attached to following nodes. + new Regex(@"^(\s*)(?:[\-\*]\s(.+(?:\n\s\1.*)*)?\n?)+", DefaultRegexOptions), + (c, s, m) => + new ListNode( + m.Groups[2] + .Captures.Select(x => new ListItemNode(Parse(c, s.Relocate(x)))) + .ToArray() + ) + ); + + /* Code blocks */ + + private static readonly IMatcher InlineCodeBlockNodeMatcher = + new RegexMatcher( + // One or two backticks are allowed, but they must match on both sides. + new Regex(@"(`{1,2})([^`]+)\1", DefaultRegexOptions | RegexOptions.Singleline), + (_, _, m) => new InlineCodeBlockNode(m.Groups[2].Value) + ); + + private static readonly IMatcher MultiLineCodeBlockNodeMatcher = + new RegexMatcher( // Language identifier is one word immediately after opening backticks, followed immediately by a linebreak. // Blank lines at the beginning and at the end of content are trimmed. new Regex(@"```(?:(\w*)\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline), - (_, m) => + (_, _, m) => new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n')) ); /* Mentions */ - private static readonly IMatcher EveryoneMentionNodeMatcher = - new StringMatcher( + private static readonly IMatcher EveryoneMentionNodeMatcher = + new StringMatcher( "@everyone", - _ => new MentionNode(null, MentionKind.Everyone) + (_, _) => new MentionNode(null, MentionKind.Everyone) ); - private static readonly IMatcher HereMentionNodeMatcher = - new StringMatcher("@here", _ => new MentionNode(null, MentionKind.Here)); + private static readonly IMatcher HereMentionNodeMatcher = + new StringMatcher( + "@here", + (_, _) => new MentionNode(null, MentionKind.Here) + ); - private static readonly IMatcher UserMentionNodeMatcher = - new RegexMatcher( + private static readonly IMatcher UserMentionNodeMatcher = + new RegexMatcher( // Capture <@123456> or <@!123456> new Regex(@"<@!?(\d+)>", DefaultRegexOptions), - (_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.User) + (_, _, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.User) ); - private static readonly IMatcher ChannelMentionNodeMatcher = - new RegexMatcher( + private static readonly IMatcher ChannelMentionNodeMatcher = + new RegexMatcher( // Capture <#123456> new Regex(@"<\#!?(\d+)>", DefaultRegexOptions), - (_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Channel) + (_, _, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Channel) ); - private static readonly IMatcher RoleMentionNodeMatcher = - new RegexMatcher( + private static readonly IMatcher RoleMentionNodeMatcher = + new RegexMatcher( // Capture <@&123456> new Regex(@"<@&(\d+)>", DefaultRegexOptions), - (_, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Role) + (_, _, m) => new MentionNode(Snowflake.TryParse(m.Groups[1].Value), MentionKind.Role) ); /* Emoji */ - private static readonly IMatcher StandardEmojiNodeMatcher = - new RegexMatcher( + private static readonly IMatcher StandardEmojiNodeMatcher = + new RegexMatcher( new Regex( @"(" + @@ -239,21 +257,21 @@ internal static partial class MarkdownParser + @")", DefaultRegexOptions ), - (_, m) => new EmojiNode(m.Groups[1].Value) + (_, _, m) => new EmojiNode(m.Groups[1].Value) ); - private static readonly IMatcher CodedStandardEmojiNodeMatcher = - new RegexMatcher( + private static readonly IMatcher CodedStandardEmojiNodeMatcher = + new RegexMatcher( // Capture :thinking: new Regex(@":([\w_]+):", DefaultRegexOptions), - (_, m) => EmojiIndex.TryGetName(m.Groups[1].Value)?.Pipe(n => new EmojiNode(n)) + (_, _, m) => EmojiIndex.TryGetName(m.Groups[1].Value)?.Pipe(n => new EmojiNode(n)) ); - private static readonly IMatcher CustomEmojiNodeMatcher = - new RegexMatcher( + private static readonly IMatcher CustomEmojiNodeMatcher = + new RegexMatcher( // Capture <:lul:123456> or new Regex(@"<(a)?:(.+?):(\d+?)>", DefaultRegexOptions), - (_, m) => + (_, _, m) => new EmojiNode( Snowflake.TryParse(m.Groups[3].Value), m.Groups[2].Value, @@ -263,70 +281,72 @@ internal static partial class MarkdownParser /* Links */ - private static readonly IMatcher AutoLinkNodeMatcher = - new RegexMatcher( + private static readonly IMatcher AutoLinkNodeMatcher = + new RegexMatcher( // Any non-whitespace character after http:// or https:// // until the last punctuation character or whitespace. new Regex(@"(https?://\S*[^\.,:;""'\s])", DefaultRegexOptions), - (_, m) => new LinkNode(m.Groups[1].Value) + (_, _, m) => new LinkNode(m.Groups[1].Value) ); - private static readonly IMatcher HiddenLinkNodeMatcher = - new RegexMatcher( + private static readonly IMatcher HiddenLinkNodeMatcher = + new RegexMatcher( // Same as auto link but also surrounded by angular brackets new Regex(@"<(https?://\S*[^\.,:;""'\s])>", DefaultRegexOptions), - (_, m) => new LinkNode(m.Groups[1].Value) + (_, _, m) => new LinkNode(m.Groups[1].Value) ); - private static readonly IMatcher MaskedLinkNodeMatcher = - new RegexMatcher( + private static readonly IMatcher MaskedLinkNodeMatcher = + new RegexMatcher( // Capture [title](link) new Regex(@"\[(.+?)\]\((.+?)\)", DefaultRegexOptions), - (s, m) => new LinkNode(m.Groups[2].Value, Parse(s.Relocate(m.Groups[1]))) + (c, s, m) => new LinkNode(m.Groups[2].Value, Parse(c, s.Relocate(m.Groups[1]))) ); /* Text */ - private static readonly IMatcher ShrugTextNodeMatcher = - new StringMatcher( + private static readonly IMatcher ShrugTextNodeMatcher = + new StringMatcher( // Capture the shrug kaomoji. // This escapes it from matching for formatting. @"¯\_(ツ)_/¯", - s => new TextNode(s.ToString()) + (s, _) => new TextNode(s.ToString()) ); - private static readonly IMatcher IgnoredEmojiTextNodeMatcher = - new RegexMatcher( + private static readonly IMatcher IgnoredEmojiTextNodeMatcher = + new RegexMatcher( // Capture some specific emoji that don't get rendered. // This escapes them from matching for emoji. new Regex(@"([\u26A7\u2640\u2642\u2695\u267E\u00A9\u00AE\u2122])", DefaultRegexOptions), - (_, m) => new TextNode(m.Groups[1].Value) + (_, _, m) => new TextNode(m.Groups[1].Value) ); - private static readonly IMatcher EscapedSymbolTextNodeMatcher = - new RegexMatcher( + private static readonly IMatcher EscapedSymbolTextNodeMatcher = + new RegexMatcher( // Capture any "symbol/other" character or surrogate pair preceded by a backslash. // This escapes them from matching for emoji. // https://github.com/Tyrrrz/DiscordChatExporter/issues/230 new Regex(@"\\(\p{So}|\p{Cs}{2})", DefaultRegexOptions), - (_, m) => new TextNode(m.Groups[1].Value) + (_, _, m) => new TextNode(m.Groups[1].Value) ); - private static readonly IMatcher EscapedCharacterTextNodeMatcher = - new RegexMatcher( - // Capture any non-whitespace, non latin alphanumeric character preceded by a backslash. - // This escapes them from matching for formatting or other tokens. - new Regex(@"\\([^a-zA-Z0-9\s])", DefaultRegexOptions), - (_, m) => new TextNode(m.Groups[1].Value) - ); + private static readonly IMatcher< + MarkdownContext, + MarkdownNode + > EscapedCharacterTextNodeMatcher = new RegexMatcher( + // Capture any non-whitespace, non latin alphanumeric character preceded by a backslash. + // This escapes them from matching for formatting or other tokens. + new Regex(@"\\([^a-zA-Z0-9\s])", DefaultRegexOptions), + (_, _, m) => new TextNode(m.Groups[1].Value) + ); /* Misc */ - private static readonly IMatcher TimestampNodeMatcher = - new RegexMatcher( + private static readonly IMatcher TimestampNodeMatcher = + new RegexMatcher( // Capture or new Regex(@"", DefaultRegexOptions), - (_, m) => + (_, _, m) => { try { @@ -382,50 +402,51 @@ internal static partial class MarkdownParser ); // Matchers that have similar patterns are ordered from most specific to least specific - private static readonly IMatcher NodeMatcher = new AggregateMatcher( - // Escaped text - ShrugTextNodeMatcher, - IgnoredEmojiTextNodeMatcher, - EscapedSymbolTextNodeMatcher, - EscapedCharacterTextNodeMatcher, - // Formatting - ItalicBoldFormattingNodeMatcher, - ItalicUnderlineFormattingNodeMatcher, - BoldFormattingNodeMatcher, - ItalicFormattingNodeMatcher, - UnderlineFormattingNodeMatcher, - ItalicAltFormattingNodeMatcher, - StrikethroughFormattingNodeMatcher, - SpoilerFormattingNodeMatcher, - MultiLineQuoteNodeMatcher, - RepeatedSingleLineQuoteNodeMatcher, - SingleLineQuoteNodeMatcher, - HeadingNodeMatcher, - ListNodeMatcher, - // Code blocks - MultiLineCodeBlockNodeMatcher, - InlineCodeBlockNodeMatcher, - // Mentions - EveryoneMentionNodeMatcher, - HereMentionNodeMatcher, - UserMentionNodeMatcher, - ChannelMentionNodeMatcher, - RoleMentionNodeMatcher, - // Links - MaskedLinkNodeMatcher, - AutoLinkNodeMatcher, - HiddenLinkNodeMatcher, - // Emoji - StandardEmojiNodeMatcher, - CustomEmojiNodeMatcher, - CodedStandardEmojiNodeMatcher, - // Misc - TimestampNodeMatcher - ); + private static readonly IMatcher NodeMatcher = + new AggregateMatcher( + // Escaped text + ShrugTextNodeMatcher, + IgnoredEmojiTextNodeMatcher, + EscapedSymbolTextNodeMatcher, + EscapedCharacterTextNodeMatcher, + // Formatting + ItalicBoldFormattingNodeMatcher, + ItalicUnderlineFormattingNodeMatcher, + BoldFormattingNodeMatcher, + ItalicFormattingNodeMatcher, + UnderlineFormattingNodeMatcher, + ItalicAltFormattingNodeMatcher, + StrikethroughFormattingNodeMatcher, + SpoilerFormattingNodeMatcher, + MultiLineQuoteNodeMatcher, + RepeatedSingleLineQuoteNodeMatcher, + SingleLineQuoteNodeMatcher, + HeadingNodeMatcher, + ListNodeMatcher, + // Code blocks + MultiLineCodeBlockNodeMatcher, + InlineCodeBlockNodeMatcher, + // Mentions + EveryoneMentionNodeMatcher, + HereMentionNodeMatcher, + UserMentionNodeMatcher, + ChannelMentionNodeMatcher, + RoleMentionNodeMatcher, + // Links + MaskedLinkNodeMatcher, + AutoLinkNodeMatcher, + HiddenLinkNodeMatcher, + // Emoji + StandardEmojiNodeMatcher, + CustomEmojiNodeMatcher, + CodedStandardEmojiNodeMatcher, + // Misc + TimestampNodeMatcher + ); // Minimal set of matchers for non-multimedia formats (e.g. plain text) - private static readonly IMatcher MinimalNodeMatcher = - new AggregateMatcher( + private static readonly IMatcher MinimalNodeMatcher = + new AggregateMatcher( // Mentions EveryoneMentionNodeMatcher, HereMentionNodeMatcher, @@ -439,24 +460,46 @@ internal static partial class MarkdownParser ); private static IReadOnlyList Parse( + MarkdownContext context, StringSegment segment, - IMatcher matcher - ) => matcher.MatchAll(segment, s => new TextNode(s.ToString())).Select(r => r.Value).ToArray(); + IMatcher matcher + ) + { + // Limit recursion depth to a reasonable number to prevent + // stack overflow on messages with inadvertently deep nesting. + // Example: ********************************* (repeat ad nauseam) + // https://github.com/Tyrrrz/DiscordChatExporter/issues/1214 + if (context.Depth >= 32) + return [new TextNode(segment.ToString())]; + + return matcher + .MatchAll( + new MarkdownContext(context.Depth + 1), + segment, + (_, s) => new TextNode(s.ToString()) + ) + .Select(r => r.Value) + .ToArray(); + } } internal static partial class MarkdownParser { - private static IReadOnlyList Parse(StringSegment segment) => - Parse(segment, NodeMatcher); + private static IReadOnlyList Parse( + MarkdownContext context, + StringSegment segment + ) => Parse(context, segment, NodeMatcher); public static IReadOnlyList Parse(string markdown) => - Parse(new StringSegment(markdown)); + Parse(new MarkdownContext(), new StringSegment(markdown)); - private static IReadOnlyList ParseMinimal(StringSegment segment) => - Parse(segment, MinimalNodeMatcher); + private static IReadOnlyList ParseMinimal( + MarkdownContext context, + StringSegment segment + ) => Parse(context, segment, MinimalNodeMatcher); public static IReadOnlyList ParseMinimal(string markdown) => - ParseMinimal(new StringSegment(markdown)); + ParseMinimal(new MarkdownContext(), new StringSegment(markdown)); private static void ExtractLinks(IEnumerable nodes, ICollection links) { diff --git a/DiscordChatExporter.Core/Markdown/Parsing/RegexMatcher.cs b/DiscordChatExporter.Core/Markdown/Parsing/RegexMatcher.cs index 1fa6288b..34eb4139 100644 --- a/DiscordChatExporter.Core/Markdown/Parsing/RegexMatcher.cs +++ b/DiscordChatExporter.Core/Markdown/Parsing/RegexMatcher.cs @@ -3,9 +3,12 @@ using System.Text.RegularExpressions; namespace DiscordChatExporter.Core.Markdown.Parsing; -internal class RegexMatcher(Regex regex, Func transform) : IMatcher +internal class RegexMatcher( + Regex regex, + Func transform +) : IMatcher { - public ParsedMatch? TryMatch(StringSegment segment) + public ParsedMatch? TryMatch(TContext context, StringSegment segment) { var match = regex.Match(segment.Source, segment.StartIndex, segment.Length); if (!match.Success) @@ -20,8 +23,8 @@ internal class RegexMatcher(Regex regex, Func trans return null; var segmentMatch = segment.Relocate(match); - var value = transform(segmentMatch, match); + var value = transform(context, segmentMatch, match); - return value is not null ? new ParsedMatch(segmentMatch, value) : null; + return value is not null ? new ParsedMatch(segmentMatch, value) : null; } } diff --git a/DiscordChatExporter.Core/Markdown/Parsing/StringMatcher.cs b/DiscordChatExporter.Core/Markdown/Parsing/StringMatcher.cs index 881046c9..bd336c48 100644 --- a/DiscordChatExporter.Core/Markdown/Parsing/StringMatcher.cs +++ b/DiscordChatExporter.Core/Markdown/Parsing/StringMatcher.cs @@ -2,16 +2,16 @@ namespace DiscordChatExporter.Core.Markdown.Parsing; -internal class StringMatcher( +internal class StringMatcher( string needle, StringComparison comparison, - Func transform -) : IMatcher + Func transform +) : IMatcher { - public StringMatcher(string needle, Func transform) + public StringMatcher(string needle, Func transform) : this(needle, StringComparison.Ordinal, transform) { } - public ParsedMatch? TryMatch(StringSegment segment) + public ParsedMatch? TryMatch(TContext context, StringSegment segment) { var index = segment.Source.IndexOf(needle, segment.StartIndex, segment.Length, comparison); @@ -19,8 +19,8 @@ internal class StringMatcher( return null; var segmentMatch = segment.Relocate(index, needle.Length); - var value = transform(segmentMatch); + var value = transform(context, segmentMatch); - return value is not null ? new ParsedMatch(segmentMatch, value) : null; + return value is not null ? new ParsedMatch(segmentMatch, value) : null; } }