From f13ca4d486762c3f3e90ff7256a52e2cc503f674 Mon Sep 17 00:00:00 2001 From: Matt Wiebe Date: Thu, 6 Feb 2014 16:45:17 -0600 Subject: [PATCH] Markdown: preserve all data in code blocks * Ensure that KSES doesn't strip things early * Prettify things when sending back to the editor --- _inc/lib/markdown/gfm.php | 61 +++++++++++++++++++++++++++--- modules/markdown/easy-markdown.php | 48 ++++++++++++++++++++--- 2 files changed, 97 insertions(+), 12 deletions(-) diff --git a/_inc/lib/markdown/gfm.php b/_inc/lib/markdown/gfm.php index 680b8f571131a..4a9e0a24ad469 100644 --- a/_inc/lib/markdown/gfm.php +++ b/_inc/lib/markdown/gfm.php @@ -6,9 +6,7 @@ * @author Matt Wiebe * @link https://github.com/evansolomon/wp-github-flavored-markdown-comments * - * Add a few extras from GitHub's Markdown implementation. Must be used - * in a WordPress environment if the $preserve_shortcodes member is set to true, - * which will be auto-detected initially on __construct() + * Add a few extras from GitHub's Markdown implementation. Must be used in a WordPress environment. */ class WPCom_GHF_Markdown_Parser extends MarkdownExtra_Parser { @@ -102,6 +100,51 @@ public function transform( $text ) { return $text; } + /** + * Preserve code block contents by HTML encoding them. Useful before getting to KSES stripping. + * @param string $text Markdown/HTML content + * @return string Markdown/HTML content with escaped code blocks + */ + public function codeblock_preserve( $text ) { + $text = preg_replace_callback( "/^(`{3})([^`\n]+)?\n([^`~]+)(`{3})/m", array( $this, 'do_codeblock_preserve' ), $text ); + $text = preg_replace_callback( "/^(~{3})([^~\n]+)?\n([^~~]+)(~{3})/m", array( $this, 'do_codeblock_preserve' ), $text ); + return $text; + } + + /** + * Regex callback for code block preservation. 
+ * @param array $matches Regex matches + * @return string Codeblock with escaped interior + */ + public function do_codeblock_preserve( $matches ) { + $block = stripslashes( $matches[3] ); + $block = esc_html( $block ); + $open = $matches[1] . $matches[2] . "\n"; + return $open . $block . $matches[4]; + } + + /** + * Restore previously preserved (i.e. escaped) code block contents. + * @param string $text Markdown/HTML content with escaped code blocks + * @return string Markdown/HTML content + */ + public function codeblock_restore( $text ) { + $text = preg_replace_callback( "/^(`{3})([^`\n]+)?\n([^`~]+)(`{3})/m", array( $this, 'do_codeblock_restore' ), $text ); + $text = preg_replace_callback( "/^(~{3})([^~\n]+)?\n([^~~]+)(~{3})/m", array( $this, 'do_codeblock_restore' ), $text ); + return $text; + } + + /** + * Regex callback for code block restoration (unescaping). + * @param array $matches Regex matches + * @return string Codeblock with unescaped interior + */ + public function do_codeblock_restore( $matches ) { + $block = html_entity_decode( $matches[3] ); + $open = $matches[1] . $matches[2] . "\n"; + return $open . $block . $matches[4]; + } + /** * Called to preserve legacy LaTeX like $latex some-latex-text $ * @param string $text Text in which to preserve LaTeX @@ -254,10 +297,15 @@ public function _doEscapeForHashWithoutSpacing( $m ) { * Overload to support Viper's [code] shortcode. Because awesome. */ public function _doFencedCodeBlocks_callback( $matches ) { - // just MarkdownExtra_Parser if we're not going ultra-deluxe, or if - // there wasn't a language class passed - if ( ! $this->use_code_shortcode || empty( $matches[2] ) ) + // just MarkdownExtra_Parser if we're not going ultra-deluxe + if ( ! $this->use_code_shortcode ) { return parent::_doFencedCodeBlocks_callback( $matches ); + } + + // default to a "text" class if one wasn't passed. Helps with encoding issues later. 
+ if ( empty( $matches[2] ) ) { + $matches[2] = 'text'; + } $classname =& $matches[2]; $codeblock = preg_replace_callback('/^\n+/', array( $this, '_doFencedCodeBlocks_newlines' ), $matches[4] ); @@ -265,6 +313,7 @@ public function _doFencedCodeBlocks_callback( $matches ) { if ( $classname{0} == '.' ) $classname = substr( $classname, 1 ); + $codeblock = esc_html( $codeblock ); $codeblock = sprintf( $this->shortcode_start, $classname ) . "\n{$codeblock}" . $this->shortcode_end; return "\n\n" . $this->hashBlock( $codeblock ). "\n\n"; } diff --git a/modules/markdown/easy-markdown.php b/modules/markdown/easy-markdown.php index 69ebb30207dc5..12f3dae39d501 100644 --- a/modules/markdown/easy-markdown.php +++ b/modules/markdown/easy-markdown.php @@ -99,6 +99,7 @@ public function load_markdown_for_posts() { add_action( 'wp_restore_post_revision', array( $this, 'wp_restore_post_revision' ), 10, 2 ); add_filter( '_wp_post_revision_fields', array( $this, '_wp_post_revision_fields' ) ); add_action( 'xmlrpc_call', array( $this, 'xmlrpc_actions' ) ); + add_filter( 'content_save_pre', array( $this, 'preserve_code_blocks' ), 1 ); if ( defined( 'XMLRPC_REQUEST' ) && XMLRPC_REQUEST ) { $this->check_for_mwgetpost(); } @@ -116,6 +117,7 @@ public function unload_markdown_for_posts() { remove_action( 'wp_restore_post_revision', array( $this, 'wp_restore_post_revision' ), 10, 2 ); remove_filter( '_wp_post_revision_fields', array( $this, '_wp_post_revision_fields' ) ); remove_action( 'xmlrpc_call', array( $this, 'xmlrpc_actions' ) ); + remove_filter( 'content_save_pre', array( $this, 'preserve_code_blocks' ), 1 ); } /** @@ -194,6 +196,15 @@ public function o2_unescape_lists( $text ) { return preg_replace( '/^[&]\#042; /um', '* ', $text ); } + /** + * Preserve code blocks from being munged by KSES before they have a chance + * @param string $text post content + * @return string post content with code blocks escaped + */ + public function preserve_code_blocks( $text ) { + return 
$this->get_parser()->codeblock_preserve( $text ); + } + /** * Remove KSES if it's there. Store the result to manually invoke later if needed. * @return null @@ -348,8 +359,10 @@ protected function get_post_screen_post_type() { public function edit_post_content( $content, $id ) { if ( $this->is_markdown( $id ) ) { $post = get_post( $id ); - if ( $post && ! empty( $post->post_content_filtered ) ) - $content = $post->post_content_filtered; + if ( $post && ! empty( $post->post_content_filtered ) ) { + $post = $this->swap_for_editing( $post ); + return $post->post_content; + } } return $content; } @@ -462,12 +475,16 @@ protected function comment_hash( $content ) { * @param array $args Arguments, with keys: * id: provide a string to prefix footnotes with a unique identifier * unslash: when true, expects and returns slashed data + * decode_code_blocks: when true, assume that text in fenced code blocks is already + * HTML encoded and should be decoded before being passed to Markdown, which does + * its own encoding. * @return string Markdown-processed content */ public function transform( $text, $args = array() ) { $args = wp_parse_args( $args, array( 'id' => false, - 'unslash' => true + 'unslash' => true, + 'decode_code_blocks' => ! $this->get_parser()->use_code_shortcode ) ); // probably need to unslash if ( $args['unslash'] ) @@ -482,6 +499,10 @@ public function transform( $text, $args = array() ) { $text = preg_replace( '/^&gt;/m', '>', $text ); // prefixes are because we need to namespace footnotes by post_id $this->get_parser()->fn_id_prefix = $args['id'] ? $args['id'] . '-' : ''; + // If we're not using the code shortcode, prevent over-encoding. + if ( $args['decode_code_blocks'] ) { + $text = $this->get_parser()->codeblock_restore( $text ); + } // Transform it! 
$text = $this->get_parser()->transform( $text ); // Fix footnotes - kses doesn't like the : IDs it supplies @@ -595,9 +616,7 @@ private function prime_post_cache( $post_id = false ) { $post = get_post( $post_id ); if ( ! empty( $post->post_content_filtered ) ) { wp_cache_delete( $post->ID, 'posts' ); - $markdown = $post->post_content_filtered; - $post->post_content_filtered = $post->post_content; - $post->post_content = $markdown; + $post = $this->swap_for_editing( $post ); wp_cache_add( $post->ID, $post, 'posts' ); $this->posts_to_uncache[] = $post_id; } @@ -608,6 +627,23 @@ private function prime_post_cache( $post_id = false ) { } } + /** + * Swaps `post_content_filtered` back to `post_content` for editing purposes. + * @param object $post WP_Post object + * @return object WP_Post object with swapped `post_content_filtered` and `post_content` + */ + protected function swap_for_editing( $post ) { + $markdown = $post->post_content_filtered; + // unencode encoded code blocks + $markdown = $this->get_parser()->codeblock_restore( $markdown ); + // restore beginning of line blockquotes + $markdown = preg_replace( '/^&gt; /m', '> ', $markdown ); + $post->post_content_filtered = $post->post_content; + $post->post_content = $markdown; + return $post; + } + + /** * We munge the post cache to serve proper markdown content to XML-RPC clients. * Uncache these after the XML-RPC session ends.