From d851238cc6dc1a0a2f16e3624e0bece6129d3aed Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Sat, 3 Feb 2024 14:45:52 -0800 Subject: [PATCH] Parse YAML-LD embedded in HTML script elements. --- examples/indented-stream.yaml | 9 ++ lib/yaml_ld/api.rb | 15 ++++ spec/expand_spec.rb | 163 ++++++++++++++++++++++++++++++++++ spec/suite_helper.rb | 100 +++++++++++++++++++++ yaml-ld.gemspec | 1 + 5 files changed, 288 insertions(+) create mode 100644 examples/indented-stream.yaml diff --git a/examples/indented-stream.yaml b/examples/indented-stream.yaml new file mode 100644 index 0000000..4cc0c2d --- /dev/null +++ b/examples/indented-stream.yaml @@ -0,0 +1,9 @@ + "@context": https://json-ld.org/contexts/person.jsonld + "@id": http://dbpedia.org/resource/John_Lennon + name: John Lennon + born: 1940-10-09 + spouse: http://dbpedia.org/resource/Cynthia_Lennon + --- + "@context": https://json-ld.org/contexts/person.jsonld + "@id": http://dbpedia.org/resource/Cynthia_Lennon + born: 1939-09-10 diff --git a/lib/yaml_ld/api.rb b/lib/yaml_ld/api.rb index 1e66e0c..444e806 100644 --- a/lib/yaml_ld/api.rb +++ b/lib/yaml_ld/api.rb @@ -73,6 +73,7 @@ def self.expand(input, serializer: self.method(:serializer), **options, &block) + JSON::LD::API.add_script_loader('application/ld+yaml', self.method(:htmlLoader)) JSON::LD::API.expand(input, allowed_content_types: %r(application/(.+\+)?yaml), documentLoader: documentLoader, @@ -113,6 +114,7 @@ def self.compact(input, context, expanded: false, serializer: self.method(:serializer), **options, &block) + JSON::LD::API.add_script_loader('application/ld+yaml', self.method(:htmlLoader)) JSON::LD::API.compact(input, context, expanded: expanded, allowed_content_types: %r(application/(.+\+)?yaml), documentLoader: documentLoader, @@ -154,6 +156,7 @@ def self.flatten(input, context, expanded: false, serializer: self.method(:serializer), **options, &block) + JSON::LD::API.add_script_loader('application/ld+yaml', self.method(:htmlLoader)) JSON::LD::API.flatten(input, context, expanded: expanded, allowed_content_types: %r(application/(.+\+)?yaml), documentLoader: documentLoader, @@ -200,6 +203,7 @@ def self.frame(input, frame, expanded: false, serializer: self.method(:serializer), **options, &block) + JSON::LD::API.add_script_loader('application/ld+yaml', self.method(:htmlLoader)) JSON::LD::API.frame(input, frame, expanded: expanded, allowed_content_types: %r(application/(.+\+)?yaml), documentLoader: documentLoader, @@ -229,6 +233,7 @@ def self.toRdf(input, expanded: false, documentLoader: self.method(:documentLoader), **options, &block) + JSON::LD::API.add_script_loader('application/ld+yaml', self.method(:htmlLoader)) JSON::LD::API.toRdf(input, expanded: expanded, allowed_content_types: %r(application/(.+\+)?yaml), documentLoader: documentLoader, @@ -334,6 +339,16 @@ def self.documentLoader(url, extractAllScripts: false, profile: nil, requestProf end end + ## + # Extracts a single YAML script, or a stream of YAML scripts from HTML script tags. + def self.htmlLoader(content, url:, extractAllScripts: false, **options) + if extractAllScripts + Representation.load_stream(content.unindent, filename: url.to_s, **options) + else + Representation.load(content, filename: url.to_s, **options) + end + end + ## # The default serializer for serialzing Ruby Objects to JSON. # diff --git a/spec/expand_spec.rb b/spec/expand_spec.rb index b214744..5f4c8f2 100644 --- a/spec/expand_spec.rb +++ b/spec/expand_spec.rb @@ -476,6 +476,169 @@ end end + context "html" do + %w[REXML].each do |impl| + next unless Module.constants.map(&:to_s).include?(impl) + + context impl do + let(:library) { impl.downcase.to_s.to_sym } + + { + 'Expands embedded YAML-LD script element': { + input: %( + + + + + ), + output: %([{ + "http://example.com/foo": [{"@list": [{"@value": "bar"}]}] + }]) + }, + 'Expands first script element': { + input: %( + + + + + + ), + output: %([{ + "http://example.com/foo": [{"@list": [{"@value": "bar"}]}] + }]) + }, + 'Expands targeted script element': { + input: %( + + + + + + ), + output: %([ + {"http://example.com/foo": [{"@value": "foo"}]}, + {"http://example.com/bar": [{"@value": "bar"}]} + ]), + base: "http://example.org/doc#second" + }, + 'Expands all script elements with extractAllScripts option': { + input: %( + + + + + + ), + output: %([ + {"http://example.com/foo": [{"@list": [{"@value": "bar"}]}]}, + { + "@graph": [{ + "http://example.com/foo": [{"@value": "foo"}] + }, { + "http://example.com/bar": [{"@value": "bar"}] + }] + } + ]), + extractAllScripts: true + }, + 'Expands all script elements with extractAllScripts option (doc stream)': { + input: %( + + + + + ), + output: %([ + {"http://example.com/foo": [{"@list": [{"@value": "bar"}]}]}, + { + "@graph": [{ + "http://example.com/foo": [{"@value": "foo"}] + }, { + "http://example.com/bar": [{"@value": "bar"}] + }] + } + ]), + extractAllScripts: true + }, + }.each do |title, params| + it(title) do + skip "rexml" if params[:not] == library + params = params.merge(input: StringIO.new(params[:input])) + params[:input].send(:define_singleton_method, :content_type) { "text/html" } + run_expand params.merge(validate: true, library: library) + end + end + end + end + end + context "JSON-LD-star" do { "node with embedded subject without rdfstar option": { diff --git a/spec/suite_helper.rb b/spec/suite_helper.rb index 4c413c7..360c649 100644 --- a/spec/suite_helper.rb +++ b/spec/suite_helper.rb @@ -406,5 +406,105 @@ def documentLoader(url, **options, &block) "don't raise error" end module_function :documentLoader + + ## + # Load one or more script tags from an HTML source. + # Unescapes and uncomments input, returns the internal representation + # Yields document base + # @param [String] input + # @param [String] url Original URL + # @param [:nokogiri, :rexml] library (nil) + # @param [Boolean] extractAllScripts (false) + # @param [Boolean] profile (nil) Optional priortized profile when loading a single script by type. + # @param [Hash{Symbol => Object}] options + def self.load_html(input, url:, + library: nil, + extractAllScripts: false, + profile: nil, + **options) + + if input.is_a?(String) + library ||= begin + require 'nokogiri' + :nokogiri + rescue LoadError + :rexml + end + require "json/ld/html/#{library}" + + # Parse HTML using the appropriate library + implementation = case library + when :nokogiri then Nokogiri + when :rexml then REXML + end + extend(implementation) + + input = begin + send("initialize_html_#{library}".to_sym, input, **options) + rescue StandardError + raise JSON::LD::JsonLdError::LoadingDocumentFailed, "Malformed HTML document: #{$ERROR_INFO.message}" + end + + # Potentially update options[:base] + if (html_base = input.at_xpath("/html/head/base/@href")) + base = RDF::URI(url) if url + html_base = RDF::URI(html_base) + html_base = base.join(html_base) if base + yield html_base + end + end + + url = RDF::URI.parse(url) + if url.fragment + id = CGI.unescape(url.fragment) + # Find script with an ID based on that fragment. + element = input.at_xpath("//script[@id='#{id}']") + raise JSON::LD::JsonLdError::LoadingDocumentFailed, "No script tag found with id=#{id}" unless element + + unless element.attributes['type'].to_s.start_with?('application/ld+json') + raise JSON::LD::JsonLdError::LoadingDocumentFailed, + "Script tag has type=#{element.attributes['type']}" + end + + content = element.inner_html + validate_input(content, url: url) if options[:validate] + mj_opts = options.keep_if { |k, v| k != :adapter || MUTLI_JSON_ADAPTERS.include?(v) } + MultiJson.load(content, **mj_opts) + elsif extractAllScripts + res = [] + elements = if profile + es = input.xpath("//script[starts-with(@type, 'application/ld+json;profile=#{profile}')]") + # If no profile script, just take a single script without profile + es = [input.at_xpath("//script[starts-with(@type, 'application/ld+json')]")].compact if es.empty? + es + else + input.xpath("//script[starts-with(@type, 'application/ld+json')]") + end + elements.each do |element| + content = element.inner_html + validate_input(content, url: url) if options[:validate] + mj_opts = options.keep_if { |k, v| k != :adapter || MUTLI_JSON_ADAPTERS.include?(v) } + r = MultiJson.load(content, **mj_opts) + if r.is_a?(Hash) + res << r + elsif r.is_a?(Array) + res.concat(r) + end + end + res + else + # Find the first script with type application/ld+json. + element = input.at_xpath("//script[starts-with(@type, 'application/ld+json;profile=#{profile}')]") if profile + element ||= input.at_xpath("//script[starts-with(@type, 'application/ld+json')]") + raise JSON::LD::JsonLdError::LoadingDocumentFailed, "No script tag found" unless element + + content = element.inner_html + validate_input(content, url: url) if options[:validate] + mj_opts = options.keep_if { |k, v| k != :adapter || MUTLI_JSON_ADAPTERS.include?(v) } + MultiJson.load(content, **mj_opts) + end + rescue MultiJson::ParseError => e + raise JSON::LD::JsonLdError::InvalidScriptElement, e.message + end end end diff --git a/yaml-ld.gemspec b/yaml-ld.gemspec index 03027da..1a32fbe 100755 --- a/yaml-ld.gemspec +++ b/yaml-ld.gemspec @@ -33,6 +33,7 @@ Gem::Specification.new do |gem| gem.add_runtime_dependency 'psych', '>= 3.3' # Rails 6.0 cannot use psych 4.0 gem.add_runtime_dependency 'rdf', '~> 3.3' gem.add_runtime_dependency 'rdf-xsd', '~> 3.3' + gem.add_runtime_dependency 'rexml', '~> 3.2' gem.add_development_dependency 'getoptlong', '~> 0.2' gem.add_development_dependency 'rdf-isomorphic', '~> 3.3' gem.add_development_dependency 'rdf-spec', '~> 3.3'