From 097fdf6a5d1ecd373c911bda4a1d7ee3c873fa21 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Wed, 12 Jun 2019 17:56:51 -0500 Subject: [PATCH 1/4] Attempt to use from HTML as a fallback --- lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex index 4a7c5eae0..7da4e7561 100644 --- a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex +++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex @@ -1,12 +1,14 @@ defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do def parse(html, data, prefix, error_message, key_name, value_name \\ "content") do with elements = [_ | _] <- get_elements(html, key_name, prefix), + page_title = get_page_title(html), meta_data = Enum.reduce(elements, data, fn el, acc -> attributes = normalize_attributes(el, prefix, key_name, value_name) Map.merge(acc, attributes) - end) do + end) + |> Map.put_new(:title, page_title) do {:ok, meta_data} else _e -> {:error, error_message} @@ -27,4 +29,8 @@ defp normalize_attributes(html_node, prefix, key_name, value_name) do %{String.to_atom(data[key_name]) => data[value_name]} end + + defp get_page_title(html) do + Floki.find(html, "title") |> Floki.text() + end end From 97d2b1a45ab12c530dd730518b9d8ca546bbc9f2 Mon Sep 17 00:00:00 2001 From: Mark Felder <feld@FreeBSD.org> Date: Wed, 12 Jun 2019 18:27:35 -0500 Subject: [PATCH 2/4] Only run Floki if title is missing from the map --- .../web/rich_media/parsers/meta_tags_parser.ex | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex index 7da4e7561..8c42557aa 100644 --- a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex +++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex @@ -1,15 +1,14 @@ defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do def parse(html, data, prefix, error_message, key_name, value_name \\ "content") do with elements = [_ | _] <- get_elements(html, key_name, prefix), - page_title = get_page_title(html), meta_data = Enum.reduce(elements, data, fn el, acc -> attributes = normalize_attributes(el, prefix, key_name, value_name) Map.merge(acc, attributes) - end) - |> Map.put_new(:title, page_title) do - {:ok, meta_data} + end) do + rich_meta_data = maybe_use_page_title(meta_data, html) + {:ok, rich_meta_data} else _e -> {:error, error_message} end @@ -30,7 +29,10 @@ defp normalize_attributes(html_node, prefix, key_name, value_name) do %{String.to_atom(data[key_name]) => data[value_name]} end - defp get_page_title(html) do - Floki.find(html, "title") |> Floki.text() + defp maybe_use_page_title(meta_data, html) do + if !Map.has_key?(meta_data, :title) do + page_title = Floki.find(html, "title") |> Floki.text() + Map.put_new(meta_data, :title, page_title) + end end end From 7363a0ea8aa5c034e0335e826c081f1166e71f92 Mon Sep 17 00:00:00 2001 From: Mark Felder <feld@FreeBSD.org> Date: Wed, 12 Jun 2019 18:32:28 -0500 Subject: [PATCH 3/4] Revert "Only run Floki if title is missing from the map" This reverts commit 97d2b1a45ab12c530dd730518b9d8ca546bbc9f2. --- .../web/rich_media/parsers/meta_tags_parser.ex | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex index 8c42557aa..7da4e7561 100644 --- a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex +++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex @@ -1,14 +1,15 @@ defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do def parse(html, data, prefix, error_message, key_name, value_name \\ "content") do with elements = [_ | _] <- get_elements(html, key_name, prefix), + page_title = get_page_title(html), meta_data = Enum.reduce(elements, data, fn el, acc -> attributes = normalize_attributes(el, prefix, key_name, value_name) Map.merge(acc, attributes) - end) do - rich_meta_data = maybe_use_page_title(meta_data, html) - {:ok, rich_meta_data} + end) + |> Map.put_new(:title, page_title) do + {:ok, meta_data} else _e -> {:error, error_message} end @@ -29,10 +30,7 @@ defp normalize_attributes(html_node, prefix, key_name, value_name) do %{String.to_atom(data[key_name]) => data[value_name]} end - defp maybe_use_page_title(meta_data, html) do - if !Map.has_key?(meta_data, :title) do - page_title = Floki.find(html, "title") |> Floki.text() - Map.put_new(meta_data, :title, page_title) - end + defp get_page_title(html) do + Floki.find(html, "title") |> Floki.text() end end From a12f8e13c8f3cd176989c28810ff578bf7c09c69 Mon Sep 17 00:00:00 2001 From: Egor Kislitsyn <egor@kislitsyn.com> Date: Thu, 13 Jun 2019 15:02:46 +0700 Subject: [PATCH 4/4] Improve <title> fallback; Add a test --- .../rich_media/parsers/meta_tags_parser.ex | 31 +++++++++++++------ .../rich_media/ogp-missing-title.html | 12 +++++++ test/web/rich_media/parser_test.exs | 22 +++++++++++++ 3 files changed, 55 insertions(+), 10 deletions(-) create mode 100644 test/fixtures/rich_media/ogp-missing-title.html diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex index 7da4e7561..82f1cce29 100644 --- a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex +++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex @@ -1,17 +1,19 @@ defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do def parse(html, data, prefix, error_message, key_name, value_name \\ "content") do - with elements = [_ | _] <- get_elements(html, key_name, prefix), - page_title = get_page_title(html), - meta_data = - Enum.reduce(elements, data, fn el, acc -> - attributes = normalize_attributes(el, prefix, key_name, value_name) + meta_data = + html + |> get_elements(key_name, prefix) + |> Enum.reduce(data, fn el, acc -> + attributes = normalize_attributes(el, prefix, key_name, value_name) - Map.merge(acc, attributes) - end) - |> Map.put_new(:title, page_title) do - {:ok, meta_data} + Map.merge(acc, attributes) + end) + |> maybe_put_title(html) + + if Enum.empty?(meta_data) do + {:error, error_message} else - _e -> {:error, error_message} + {:ok, meta_data} end end @@ -30,6 +32,15 @@ defp normalize_attributes(html_node, prefix, key_name, value_name) do %{String.to_atom(data[key_name]) => data[value_name]} end + defp maybe_put_title(%{title: _} = meta, _), do: meta + + defp maybe_put_title(meta, html) do + case get_page_title(html) do + "" -> meta + title -> Map.put_new(meta, :title, title) + end + end + defp get_page_title(html) do Floki.find(html, "title") |> Floki.text() end diff --git a/test/fixtures/rich_media/ogp-missing-title.html b/test/fixtures/rich_media/ogp-missing-title.html new file mode 100644 index 000000000..fcdbedfc6 --- /dev/null +++ b/test/fixtures/rich_media/ogp-missing-title.html @@ -0,0 +1,12 @@ +<html prefix="og: http://ogp.me/ns#"> + +<head> + <title>The Rock (1996) + + + + + + + diff --git a/test/web/rich_media/parser_test.exs b/test/web/rich_media/parser_test.exs index 3a9cc1854..a49ba9549 100644 --- a/test/web/rich_media/parser_test.exs +++ b/test/web/rich_media/parser_test.exs @@ -9,6 +9,15 @@ defmodule Pleroma.Web.RichMedia.ParserTest do } -> %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/ogp.html")} + %{ + method: :get, + url: "http://example.com/ogp-missing-title" + } -> + %Tesla.Env{ + status: 200, + body: File.read!("test/fixtures/rich_media/ogp-missing-title.html") + } + %{ method: :get, url: "http://example.com/twitter-card" @@ -51,6 +60,19 @@ test "parses ogp" do }} end + test "falls back to when ogp:title is missing" do + assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/ogp-missing-title") == + {:ok, + %{ + image: "http://ia.media-imdb.com/images/rock.jpg", + title: "The Rock (1996)", + description: + "Directed by Michael Bay. With Sean Connery, Nicolas Cage, Ed Harris, John Spencer.", + type: "video.movie", + url: "http://www.imdb.com/title/tt0117500/" + }} + end + test "parses twitter card" do assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/twitter-card") == {:ok,