diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index bd78aef7a9f..bebc9c9fd72 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -157,7 +157,7 @@ class LinkDetailsExtractor end def title - html_entities_decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)&.strip + html_entities_decode(structured_data&.headline || opengraph_tag('og:title') || head.at_xpath('title')&.content)&.strip end def description @@ -205,11 +205,11 @@ class LinkDetailsExtractor end def language - valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.xpath('//html').pick('lang')) + valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.root.attr('lang')) end def icon - valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('shortcut icon')) + valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('icon')) end private @@ -237,11 +237,11 @@ class LinkDetailsExtractor end def link_tag(name) - document.xpath("//link[@rel=\"#{name}\"]").pick('href') + head.xpath('//link[@rel][@href]').find { |el| el.kwattr_values('rel').any? { |v| name.casecmp?(v) } }&.attr('href') end def opengraph_tag(name) - document.xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]").pick('content') + head.at_xpath("//meta[@property='#{name}' or @name='#{name}'][@content]")&.attr('content') end def meta_tag(name) @@ -249,6 +249,8 @@ class LinkDetailsExtractor end def structured_data + return @structured_data if defined?(@structured_data) + # Some publications have more than one JSON-LD definition on the page, # and some of those definitions aren't valid JSON either, so we have # to loop through here until we find something that is the right type @@ -273,6 +275,10 @@ class LinkDetailsExtractor @document ||= detect_encoding_and_parse_document end + def head + @head ||= document.at_xpath('/html/head') + end + def detect_encoding_and_parse_document [detect_encoding, nil, header_encoding].uniq.each do |encoding| document = Nokogiri::HTML(@html, nil, encoding) diff --git a/spec/lib/link_details_extractor_spec.rb b/spec/lib/link_details_extractor_spec.rb index b1e5cedced3..e7f22eddabb 100644 --- a/spec/lib/link_details_extractor_spec.rb +++ b/spec/lib/link_details_extractor_spec.rb @@ -42,6 +42,7 @@ RSpec.describe LinkDetailsExtractor do