mirror of
https://github.com/mastodon/mastodon.git
synced 2024-08-20 21:08:15 -07:00
Sanitize MathML in post content
Summary: ------- This commit correctly sanitizes incoming MathML according to [FEP-dc88]. Instead of completely removing MathML nodes, it replaces them with their LaTeX or plain-text representation, so that the mathematics can be read in some form by mastodon users. Test Plan: ---------- ``` $ RAILS_ENV=test bundle exec rspec spec/lib/sanitize_config_spec.rb -f d Run options: exclude {:type=>#<Proc: ./spec/rails_helper.rb:79>} Randomized with seed 58854 Sanitize::Config ::MASTODON_STRICT sanitizes math blocks to LaTeX converts h1 to p strong removes "translate" attribute with invalid value removes a without href removes a without href and only keeps text content math sanitizer falls back to plaintext keeps ul prefers latex removes a with unparsable href keeps start and reversed attributes of ol removes a with unsupported scheme in href keeps a with translate="no" keeps a with href keeps a with supported scheme and no host does not re-interpret HTML when removing unsupported links sanitizes math to LaTeX Finished in 0.17323 seconds (files took 3.28 seconds to load) 16 examples, 0 failures Randomized with seed 58854 ``` observed 100% code coverage of `lib/sanitize_ext/sanitize_config.rb`. Ran mastodon locally, and fetched [reference post][nyancat] and observed that math was converted to plaintext form (and was not missing). [FEP-dc88]: https://codeberg.org/fediverse/fep/src/branch/main/fep/dc88/fep-dc88.md [tracking]: https://codeberg.org/fediverse/fep/issues/161 [socialhub]: https://socialhub.activitypub.rocks/t/fep-dc88-formatting-mathematics/3564 [nyancat]: https://nyan.network/notice/Aa4IvnBVHysWswRX1s Related Discussion: ------------------- Please see [FEP-dc88], the [FEP tracking issue][tracking] and [FEP forum discussion][socialhub] for more information. Fixes mastodon/mastodon#26943
This commit is contained in:
parent
85fdbd0ad5
commit
9eb30dfb1c
2 changed files with 58 additions and 0 deletions
|
@ -64,6 +64,43 @@ class Sanitize
|
|||
current_node.wrap('<p></p>')
|
||||
end
|
||||
|
||||
# Replaces an incoming <math> element with a plain-text rendering of the
# mathematics it describes.
#
# We assume that incoming <math> nodes are of the form
#   <math><semantics>...<annotation>...</annotation></semantics></math>
# according to the [FEP]. We grab the most relevant plain-text annotation
# from the semantics node and use it to display a representation of the
# mathematics:
#
# * an `application/x-tex` annotation is preferred and is wrapped in `$...$`
#   (or `$$...$$` when the <math> element has display="block");
# * otherwise a `text/plain` annotation is used verbatim, without dollar
#   signs, since it isn't LaTeX;
# * if neither is present the node is left untouched for the transformers
#   that run after this one.
#
# FEP: https://codeberg.org/fediverse/fep/src/branch/main/fep/dc88/fep-dc88.md
#
# @param env [Hash] transformer environment; reads :node, :node_name and
#   :is_allowlisted
# @return [nil]
MATH_TRANSFORMER = lambda do |env|
  math = env[:node]
  return if env[:is_allowlisted]
  return unless math.element? && env[:node_name] == 'math'

  semantics = math.element_children.first
  return unless semantics&.name == 'semantics'

  # Finds the first <annotation> child with the given encoding. The encoding
  # attribute is optional in remote markup, so its absence must not raise.
  find_annotation = lambda do |encoding|
    semantics.children.find do |node|
      node.name == 'annotation' && node.attributes['encoding']&.value == encoding
    end
  end

  # Insert the replacement as a text node. Passing a String to `swap` would
  # make the parser re-interpret the annotation text as markup, so TeX such
  # as `a<b` would be mangled or turned into elements.
  replace_with_text = lambda do |text|
    math.swap(math.document.create_text_node(text))
  end

  if (tex = find_annotation.call('application/x-tex'))
    delimiter = math.attributes['display']&.value == 'block' ? '$$' : '$'
    replace_with_text.call("#{delimiter}#{tex.content}#{delimiter}")
  elsif (plain = find_annotation.call('text/plain'))
    replace_with_text.call(plain.content)
  end

  nil
end
|
||||
|
||||
MASTODON_STRICT = freeze_config(
|
||||
elements: %w(p br span a del pre blockquote code b strong u i em ul ol li),
|
||||
|
||||
|
@ -86,6 +123,7 @@ class Sanitize
|
|||
transformers: [
|
||||
CLASS_WHITELIST_TRANSFORMER,
|
||||
TRANSLATE_TRANSFORMER,
|
||||
MATH_TRANSFORMER,
|
||||
UNSUPPORTED_ELEMENTS_TRANSFORMER,
|
||||
UNSUPPORTED_HREF_TRANSFORMER,
|
||||
]
|
||||
|
|
|
@ -53,5 +53,25 @@ describe Sanitize::Config do
|
|||
it 'keeps a with supported scheme and no host' do
  # The dweb: link has a scheme but no host; it is expected to be kept, with rel/target attributes added.
  sanitized = Sanitize.fragment('<a href="dweb:/a/foo">Test</a>', subject)

  expect(sanitized).to eq '<a href="dweb:/a/foo" rel="nofollow noopener noreferrer" target="_blank">Test</a>'
end
|
||||
|
||||
it 'sanitizes math to LaTeX' do
  # Inline <math> with a TeX annotation is expected to collapse to `$...$`.
  sanitized = Sanitize.fragment(
    '<math><semantics><mrow><msup><mi>x</mi><mi>n</mi></msup><mo>+</mo><mi>y</mi></mrow><annotation encoding="application/x-tex">x^n+y</annotation></semantics></math>',
    subject
  )

  expect(sanitized).to eq '$x^n+y$'
end
|
||||
|
||||
it 'sanitizes math blocks to LaTeX' do
  # display="block" on <math> is expected to switch the delimiters to `$$...$$`.
  sanitized = Sanitize.fragment(
    '<math display="block"><semantics><mrow><msup><mi>x</mi><mi>n</mi></msup><mo>+</mo><mi>y</mi></mrow><annotation encoding="application/x-tex">x^n+y</annotation></semantics></math>',
    subject
  )

  expect(sanitized).to eq '$$x^n+y$$'
end
|
||||
|
||||
it 'math sanitizer falls back to plaintext' do
  # With only a text/plain annotation, its content is expected verbatim, without dollar signs.
  sanitized = Sanitize.fragment(
    '<math><semantics><msqrt><mi>x</mi></msqrt><annotation encoding="text/plain">sqrt(x)</annotation></semantics></math>',
    subject
  )

  expect(sanitized).to eq 'sqrt(x)'
end
|
||||
|
||||
it 'prefers latex' do
  # The TeX annotation is expected to win even though the text/plain annotation comes first.
  sanitized = Sanitize.fragment(
    '<math><semantics><msqrt><mi>x</mi></msqrt><annotation encoding="text/plain">sqrt(x)</annotation><annotation encoding="application/x-tex">\\sqrt x</annotation></semantics></math>',
    subject
  )

  expect(sanitized).to eq '$\sqrt x$'
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue