Headline
CVE-2023-28631: Merge pull request from GHSA-5r3x-p7xx-x6q5 · kivikakk/comrak@9ff5f8d
comrak is a CommonMark + GFM compatible Markdown parser and renderer written in rust. A Comrak AST can be constructed manually by a program instead of parsing a Markdown document with parse_document
. This AST can then be converted to HTML via html::format_document_with_plugins
. However, the HTML formatting code assumes that the AST is well-formed. For example, many AST notes contain [u8]
fields which the formatting code assumes is valid UTF-8 data. Several bugs can be triggered if this is not the case. Version 0.17.0 contains adjustments to the AST, storing strings instead of unvalidated byte arrays. Users are advised to upgrade. Users unable to upgrade may manually validate UTF-8 correctness of all data when assigning to &[u8]
and Vec<u8>
fields in the AST. This issue is also tracked as GHSL-2023-049
.
@@ -9,7 +9,7 @@ use crate::parser::shortcodes::NodeShortCode; use crate::parser::ComrakOptions; use crate::scanners; use crate::{nodes, ComrakPlugins}; use std;
use std::cmp::max; use std::io::{self, Write};
@@ -58,7 +58,7 @@ struct CommonMarkFormatter<’a, ‘o> { enum Escaping { Literal, Normal, URL, Url, Title, }
@@ -209,7 +209,7 @@ impl<’a, ‘o> CommonMarkFormatter<’a, ‘o> { && (c == b’.’ || c == b’)') && follows_digit && (nextc == 0 || isspace(nextc))))) || (escaping == Escaping::URL || (escaping == Escaping::Url && (c == b’`’ || c == b’<’ || c == b’>’ @@ -221,7 +221,7 @@ impl<’a, ‘o> CommonMarkFormatter<’a, ‘o> { && (c == b’`’ || c == b’<’ || c == b’>’ || c == b’"’ || c == b’\\’)));
if needs_escaping { if escaping == Escaping::URL && isspace© { if escaping == Escaping::Url && isspace© { write!(self.v, “%{:2X}", c).unwrap(); self.column += 3; } else if ispunct© { @@ -310,7 +310,7 @@ impl<’a, 'o> CommonMarkFormatter<’a, 'o> {
match node.data.borrow().value { NodeValue::Document => (), NodeValue::FrontMatter(ref fm) => self.format_front_matter(fm, entering), NodeValue::FrontMatter(ref fm) => self.format_front_matter(fm.as_bytes(), entering), NodeValue::BlockQuote => self.format_block_quote(entering), NodeValue::List(…) => self.format_list(node, entering), NodeValue::Item(…) => self.format_item(node, entering), @@ -323,16 +323,20 @@ impl<’a, ‘o> CommonMarkFormatter<’a, ‘o> { NodeValue::HtmlBlock(ref nhb) => self.format_html_block(nhb, entering), NodeValue::ThematicBreak => self.format_thematic_break(entering), NodeValue::Paragraph => self.format_paragraph(entering), NodeValue::Text(ref literal) => self.format_text(literal, allow_wrap, entering), NodeValue::Text(ref literal) => { self.format_text(literal.as_bytes(), allow_wrap, entering) } NodeValue::LineBreak => self.format_line_break(entering), NodeValue::SoftBreak => self.format_soft_break(allow_wrap, entering), NodeValue::Code(ref code) => self.format_code(&code.literal, allow_wrap, entering), NodeValue::HtmlInline(ref literal) => self.format_html_inline(literal, entering), NodeValue::Code(ref code) => { self.format_code(code.literal.as_bytes(), allow_wrap, entering) } NodeValue::HtmlInline(ref literal) => { self.format_html_inline(literal.as_bytes(), entering) } NodeValue::Strong => self.format_strong(), NodeValue::Emph => self.format_emph(node), NodeValue::TaskItem { checked, symbol } => { self.format_task_item(checked, symbol, entering) } NodeValue::TaskItem { symbol } => self.format_task_item(symbol, entering), NodeValue::Strikethrough => self.format_strikethrough(), NodeValue::Superscript => self.format_superscript(), NodeValue::Link(ref nl) => return self.format_link(node, nl, entering), @@ -343,12 +347,14 @@ impl<’a, ‘o> CommonMarkFormatter<’a, ‘o> { NodeValue::TableRow(…) => self.format_table_row(entering), NodeValue::TableCell => self.format_table_cell(node, entering), NodeValue::FootnoteDefinition(_) => self.format_footnote_definition(entering), NodeValue::FootnoteReference(ref r) => self.format_footnote_reference(r, entering), NodeValue::FootnoteReference(ref r) => { self.format_footnote_reference(r.as_bytes(), entering) } }; true }
fn format_front_matter(&mut self, front_matter: &Vec<u8>, entering: bool) { fn format_front_matter(&mut self, front_matter: &[u8], entering: bool) { if entering { self.output(front_matter, false, Escaping::Literal); } @@ -467,30 +473,33 @@ impl<’a, ‘o> CommonMarkFormatter<’a, ‘o> { self.blankline(); }
if ncb.info.is_empty() && (ncb.literal.len() > 2 && !isspace(ncb.literal[0]) && !(isspace(ncb.literal[ncb.literal.len() - 1]) && isspace(ncb.literal[ncb.literal.len() - 2]))) let info = ncb.info.as_bytes(); let literal = ncb.literal.as_bytes();
if info.is_empty() && (literal.len() > 2 && !isspace(literal[0]) && !(isspace(literal[literal.len() - 1]) && isspace(literal[literal.len() - 2]))) && !first_in_list_item { write!(self, " ").unwrap(); write!(self.prefix, " ").unwrap(); self.write_all(&ncb.literal).unwrap(); self.write_all(literal).unwrap(); let new_len = self.prefix.len() - 4; self.prefix.truncate(new_len); } else { let fence_char = if ncb.info.contains(&b’`’) { b’~’ } else { b’`’ }; let numticks = max(3, longest_char_sequence(&ncb.literal, fence_char) + 1); let fence_char = if info.contains(&b’`’) { b’~’ } else { b’`’ }; let numticks = max(3, longest_char_sequence(literal, fence_char) + 1); for _ in 0…numticks { write!(self, “{}", fence_char as char).unwrap(); } if !ncb.info.is_empty() { if !info.is_empty() { write!(self, " ").unwrap(); self.write_all(&ncb.info).unwrap(); self.write_all(info).unwrap(); } self.cr(); self.write_all(&ncb.literal).unwrap(); self.write_all(literal).unwrap(); self.cr(); for _ in 0…numticks { write!(self, “{}", fence_char as char).unwrap(); @@ -503,7 +512,7 @@ impl<’a, 'o> CommonMarkFormatter<’a, 'o> { fn format_html_block(&mut self, nhb: &NodeHtmlBlock, entering: bool) { if entering { self.blankline(); self.write_all(&nhb.literal).unwrap(); self.write_all(nhb.literal.as_bytes()).unwrap(); self.blankline(); } } @@ -522,7 +531,7 @@ impl<’a, 'o> CommonMarkFormatter<’a, 'o> { } }
fn format_text(&mut self, literal: &Vec<u8>, allow_wrap: bool, entering: bool) { fn format_text(&mut self, literal: &[u8], allow_wrap: bool, entering: bool) { if entering { self.output(literal, allow_wrap, Escaping::Normal); } @@ -550,7 +559,7 @@ impl<’a, 'o> CommonMarkFormatter<’a, ‘o> { } }
fn format_code(&mut self, literal: &Vec<u8>, allow_wrap: bool, entering: bool) { fn format_code(&mut self, literal: &[u8], allow_wrap: bool, entering: bool) { if entering { let numticks = shortest_unused_sequence(literal, b’`’); for _ in 0…numticks { @@ -577,7 +586,7 @@ impl<’a, 'o> CommonMarkFormatter<’a, 'o> { } }
fn format_html_inline(&mut self, literal: &Vec<u8>, entering: bool) { fn format_html_inline(&mut self, literal: &[u8], entering: bool) { if entering { self.write_all(literal).unwrap(); } @@ -602,9 +611,9 @@ impl<’a, 'o> CommonMarkFormatter<’a, ‘o> { self.write_all(&[emph_delim]).unwrap(); }
fn format_task_item(&mut self, _checked: bool, symbol: u8, entering: bool) { fn format_task_item(&mut self, symbol: Option<char>, entering: bool) { if entering { write!(self, "[{}] ", symbol as char).unwrap(); write!(self, "[{}] “, symbol.unwrap_or(' ')).unwrap(); } }
@@ -619,40 +628,34 @@ impl<’a, 'o> CommonMarkFormatter<’a, 'o> { fn format_link(&mut self, node: &’a AstNode<’a>, nl: &NodeLink, entering: bool) -> bool { if is_autolink(node, nl) { if entering { write!(self, “<”).unwrap(); if nl.url.len() >= 7 && &nl.url[…7] == b"mailto:” { self.write_all(&nl.url[7…]).unwrap(); } else { self.write_all(&nl.url).unwrap(); } write!(self, “>”).unwrap(); write!(self, “<{}>", nl.url.trim_start_matches(“mailto:”)).unwrap(); return false; } } else if entering { write!(self, "[").unwrap(); } else { write!(self, "](").unwrap(); self.output(&nl.url, false, Escaping::URL); self.output(nl.url.as_bytes(), false, Escaping::Url); if !nl.title.is_empty() { write!(self, " \"”).unwrap(); self.output(&nl.title, false, Escaping::Title); self.output(nl.title.as_bytes(), false, Escaping::Title); write!(self, “\"”).unwrap(); } write!(self, ")").unwrap(); }
return true; true }
fn format_image(&mut self, nl: &NodeLink, allow_wrap: bool, entering: bool) { if entering { write!(self, ".unwrap(); self.output(&nl.url, false, Escaping::URL); self.output(nl.url.as_bytes(), false, Escaping::Url); if !nl.title.is_empty() { self.output(&[b’ ‘, b’"’], allow_wrap, Escaping::Literal); self.output(&nl.title, false, Escaping::Title); self.output(nl.title.as_bytes(), false, Escaping::Title); write!(self, “\"”).unwrap(); } write!(self, ")").unwrap(); @@ -737,7 +740,7 @@ impl<’a, 'o> CommonMarkFormatter<’a, 'o> { } }
fn format_footnote_reference(&mut self, r: &Vec<u8>, entering: bool) { fn format_footnote_reference(&mut self, r: &[u8], entering: bool) { if entering { self.write_all(b”[^”).unwrap(); self.write_all®.unwrap(); @@ -792,7 +795,7 @@ fn shortest_unused_sequence(literal: &[u8], f: u8) -> usize { }
fn is_autolink<’a>(node: &’a AstNode<’a>, nl: &NodeLink) -> bool { if nl.url.is_empty() || scanners::scheme(&nl.url).is_none() { if nl.url.is_empty() || scanners::scheme(nl.url.as_bytes()).is_none() { return false; }
@@ -808,12 +811,7 @@ fn is_autolink<’a>(node: &’a AstNode<’a>, nl: &NodeLink) -> bool { }, };
let mut real_url: &[u8] = &nl.url; if real_url.len() >= 7 && &real_url[…7] == b"mailto:” { real_url = &real_url[7…]; }
real_url == &*link_text nl.url.trim_start_matches(“mailto:”) == link_text }
fn table_escape<’a>(node: &’a AstNode<’a>, c: u8) -> bool {
Related news
### Impact A Comrak AST can be constructed manually by a program instead of parsing a Markdown document with `parse_document`. This AST can then be converted to HTML via `html::format_document_with_plugins`. However, the HTML formatting code assumes that the AST is well-formed. For example, many AST notes contain `[u8]` fields which the formatting code assumes is valid UTF-8 data. Several bugs can be triggered if this is not the case. ### Patches 0.17.0 contains adjustments to the AST, storing strings instead of unvalidated byte arrays. ### Workarounds * Validate UTF-8 correctness of all data when assigning to `&[u8]` and `Vec<u8>` fields in the AST. ### References n/a