From 9eb30dfb1c9fc10fc37e8f63a534ac9d1159eea8 Mon Sep 17 00:00:00 2001 From: Calvin Lee Date: Sun, 24 Sep 2023 12:44:25 +0000 Subject: [PATCH] Sanitize MathML in post content Summary: ------- This commit correctly sanitizes incoming MathML according to [FEP-dc88]. Instead of completely removing MathML nodes, it replaces them with their LaTeX or plain-text representation, so that the mathematics can be read in some form by Mastodon users. Test Plan: ---------- ``` $ RAILS_ENV=test bundle exec rspec spec/lib/sanitize/config_spec.rb -f d Run options: exclude {:type=>#} Randomized with seed 58854 Sanitize::Config ::MASTODON_STRICT sanitizes math blocks to LaTeX converts h1 to p strong removes "translate" attribute with invalid value removes a without href removes a without href and only keeps text content math sanitizer falls back to plaintext keeps ul prefers latex removes a with unparsable href keeps start and reversed attributes of ol removes a with unsupported scheme in href keeps a with translate="no" keeps a with href keeps a with supported scheme and no host does not re-interpret HTML when removing unsupported links sanitizes math to LaTeX Finished in 0.17323 seconds (files took 3.28 seconds to load) 16 examples, 0 failures Randomized with seed 58854 ``` Observed 100% code coverage of `lib/sanitize_ext/sanitize_config.rb`. Ran Mastodon locally, fetched the [reference post][nyancat], and observed that math was converted to plaintext form (and was not missing). [FEP-dc88]: https://codeberg.org/fediverse/fep/src/branch/main/fep/dc88/fep-dc88.md [tracking]: https://codeberg.org/fediverse/fep/issues/161 [socialhub]: https://socialhub.activitypub.rocks/t/fep-dc88-formatting-mathematics/3564 [nyancat]: https://nyan.network/notice/Aa4IvnBVHysWswRX1s Related Discussion: ------------------- Please see [FEP-dc88], the [FEP tracking issue][tracking] and [FEP forum discussion][socialhub] for more information. 
Fixes mastodon/mastodon#26943 --- lib/sanitize_ext/sanitize_config.rb | 38 +++++++++++++++++++++++++++++ spec/lib/sanitize/config_spec.rb | 20 +++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/lib/sanitize_ext/sanitize_config.rb b/lib/sanitize_ext/sanitize_config.rb index 70efe7c1ae5..db81244104e 100644 --- a/lib/sanitize_ext/sanitize_config.rb +++ b/lib/sanitize_ext/sanitize_config.rb @@ -64,6 +64,43 @@ class Sanitize current_node.wrap('

') end + # We assume that incoming <math> nodes are of the form + # <math><semantics>...</semantics>...</math> + # according to the [FEP]. We grab the most relevant plain-text annotation + # from the semantics node and insert it as a text node (never re-parsed as + # markup) to display a representation of the mathematics. + # + # FEP: https://codeberg.org/fediverse/fep/src/branch/main/fep/dc88/fep-dc88.md + MATH_TRANSFORMER = lambda do |env| + math = env[:node] + return if env[:is_allowlisted] + return unless math.element? && env[:node_name] == 'math' + + semantics = math.element_children[0] + return if semantics.nil? || semantics.name != 'semantics' + + # next, we find the plain-text description (a missing encoding never matches) + is_annotation_with_encoding = lambda do |encoding, node| + return false unless node.name == 'annotation' + + node['encoding'] == encoding + end + + annotation = semantics.children.find(&is_annotation_with_encoding.curry['application/x-tex']) + if annotation + if math.attributes['display']&.value == 'block' + math.replace(Nokogiri::XML::Text.new("$$#{annotation.content}$$", math.document)) + else + math.replace(Nokogiri::XML::Text.new("$#{annotation.content}$", math.document)) + end + return + end + # Don't bother surrounding 'text/plain' annotations with dollar signs, + # since it isn't LaTeX + annotation = semantics.children.find(&is_annotation_with_encoding.curry['text/plain']) + math.replace(Nokogiri::XML::Text.new(annotation.content, math.document)) unless annotation.nil? 
+ end + MASTODON_STRICT = freeze_config( elements: %w(p br span a del pre blockquote code b strong u i em ul ol li), @@ -86,6 +123,7 @@ class Sanitize transformers: [ CLASS_WHITELIST_TRANSFORMER, TRANSLATE_TRANSFORMER, + MATH_TRANSFORMER, UNSUPPORTED_ELEMENTS_TRANSFORMER, UNSUPPORTED_HREF_TRANSFORMER, ] diff --git a/spec/lib/sanitize/config_spec.rb b/spec/lib/sanitize/config_spec.rb index 2d8dc2f63be..aa39b9becab 100644 --- a/spec/lib/sanitize/config_spec.rb +++ b/spec/lib/sanitize/config_spec.rb @@ -53,5 +53,25 @@ describe Sanitize::Config do it 'keeps a with supported scheme and no host' do expect(Sanitize.fragment('Test', subject)).to eq 'Test' end + + it 'sanitizes math to LaTeX' do + mathml = 'xn+yx^n+y' + expect(Sanitize.fragment(mathml, subject)).to eq '$x^n+y$' + end + + it 'sanitizes math blocks to LaTeX' do + mathml = 'xn+yx^n+y' + expect(Sanitize.fragment(mathml, subject)).to eq '$$x^n+y$$' + end + + it 'math sanitizer falls back to plaintext' do + mathml = 'xsqrt(x)' + expect(Sanitize.fragment(mathml, subject)).to eq 'sqrt(x)' + end + + it 'prefers latex' do + mathml = 'xsqrt(x)\\sqrt x' + expect(Sanitize.fragment(mathml, subject)).to eq '$\sqrt x$' + end end end