From 0751f40df782be3da8983c2d676ec0dbc00bdbe1 Mon Sep 17 00:00:00 2001 From: CodingPF Date: Sat, 24 Feb 2024 12:18:54 +0100 Subject: [PATCH] resolve missing subtitles --- .../base/webaccess/JsoupConnection.java | 40 +++++++++++- .../mserver/crawler/orfon/OrfOnCrawler.java | 2 +- .../orfon/json/OrfOnEpisodeDeserializer.java | 63 ++++++++++++------- .../orfon/json/OrfOnEpisodesDeserializer.java | 7 ++- .../OrfOnHistoryChildrenDeserializer.java | 6 -- .../orfon/json/OrfOnHistoryDeserializer.java | 1 - .../orfon/json/OrfOnScheduleDeserializer.java | 1 - .../crawler/orfon/task/OrfOnEpisodeTask.java | 2 +- .../crawler/orfon/task/OrfOnEpisodesTask.java | 2 +- 9 files changed, 90 insertions(+), 34 deletions(-) diff --git a/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java b/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java index 6ff257725..18330f14c 100644 --- a/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java +++ b/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java @@ -1,6 +1,7 @@ package de.mediathekview.mserver.base.webaccess; import okhttp3.ConnectionPool; +import okhttp3.Headers; import okhttp3.OkHttpClient; import okhttp3.Request; import okhttp3.Response; @@ -11,7 +12,12 @@ import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; +import com.google.gson.Gson; +import com.google.gson.JsonElement; + import java.io.IOException; +import java.util.Map; +import java.util.Map.Entry; import java.util.concurrent.TimeUnit; import static jakarta.ws.rs.core.HttpHeaders.CONTENT_LENGTH; @@ -41,11 +47,32 @@ public JsoupConnection(final int timeout, final int threadPoolSize) { * @throws IOException If no connection to the url could be opened. */ public String requestBodyAsString(final String url) throws IOException { + return requestBodyAsString(url, null); + + } + /** + * Request an url and receive the body as String. Add headers as a string map. 
+ * @param url The url to request. + * @param headerMap Header names mapped to their values; may be null to send no additional headers. + * @return The response body as String. + * @throws IOException If no connection to the url could be opened. + */ + public String requestBodyAsString(final String url, final Map headerMap) throws IOException { int retry = 0; int httpResponseCode; final String responseString = ""; do { - final Request request = new Request.Builder().url(url).build(); + okhttp3.Headers.Builder headerBuilder = new Headers.Builder(); + if (headerMap != null) { + for (Entry headerValue : headerMap.entrySet()) { + headerBuilder.add(headerValue.getKey(), headerValue.getValue()); + } + } + Request request = new Request.Builder() + .url(url) + .headers(headerBuilder.build()) + .build(); + try (final Response response = client.newCall(request).execute()) { httpResponseCode = response.code(); if (response.body() == null || httpResponseCode == 404 || httpResponseCode == 410) { @@ -62,6 +89,17 @@ public String requestBodyAsString(final String url) throws IOException { return responseString; } + /** + * Request an url with optional headers and receive the body parsed as a Gson JsonElement + * + * @param url The url to request. + * @return request body parsed as a Gson JsonElement + * @throws IOException If no connection to the url could be opened. 
+ */ + public JsonElement requestBodyAsJsonElement(final String url, final Map headerMap) throws IOException { + return new Gson().fromJson(requestBodyAsString(url, headerMap), JsonElement.class); + } + /** * Request an url and receive the body as HTML JSOUP Document * diff --git a/src/main/java/de/mediathekview/mserver/crawler/orfon/OrfOnCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/orfon/OrfOnCrawler.java index 08cb73fc5..ea3be7d16 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/orfon/OrfOnCrawler.java +++ b/src/main/java/de/mediathekview/mserver/crawler/orfon/OrfOnCrawler.java @@ -65,7 +65,7 @@ protected RecursiveTask> createCrawlerTask() { printMessage(ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), allVideos.size()); getAndSetMaxCount(allVideos.size()); // - // History (top categories) > children > + // History (top categories) > children > VideoItem > Episode > Episode2Film final Set historyVideos = processHistoryUrlToCrawl(); allVideos.addAll(historyVideos); printMessage(ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), allVideos.size()); diff --git a/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnEpisodeDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnEpisodeDeserializer.java index 3db8b44d9..6707d80a2 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnEpisodeDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnEpisodeDeserializer.java @@ -6,8 +6,11 @@ import de.mediathekview.mlib.daten.GeoLocations; import de.mediathekview.mlib.daten.Resolution; import de.mediathekview.mserver.base.utils.JsonUtils; +import de.mediathekview.mserver.crawler.basic.AbstractCrawler; +import de.mediathekview.mserver.crawler.orfon.OrfOnConstants; import de.mediathekview.mserver.crawler.orfon.OrfOnVideoInfoDTO; +import java.io.IOException; import java.lang.reflect.Type; import 
java.net.MalformedURLException; import java.net.URL; @@ -47,11 +50,19 @@ public class OrfOnEpisodeDeserializer implements JsonDeserializer> buildOrResolveSubs(JsonElement jsonElement) { + Optional subtitleSource = JsonUtils.getElementValueAsString(jsonElement, TAG_SUBTITLE); + Optional embeddedSubtitleSection = JsonUtils.getElement(jsonElement, TAG_SUBTITLE_SECTION); + Optional> setOfSubs = Optional.empty(); + if (embeddedSubtitleSection.isPresent()) { + setOfSubs = parseSubtitleUrls(embeddedSubtitleSection.get()); + } else if (subtitleSource.isPresent()) { + Map myMap = Map.ofEntries( + Map.entry("Authorization", OrfOnConstants.AUTH), + Map.entry("Accept-Charset", "UTF_8"), + Map.entry("User-Agent", "Mozilla"), + Map.entry("Accept-Encoding", "*")); + JsonElement newRequestForSubs = null; + try { + newRequestForSubs = crawler.getConnection().requestBodyAsJsonElement(subtitleSource.get().toString(), myMap); + setOfSubs = parseSubtitleUrls(newRequestForSubs); + } catch (IOException e) { + LOG.error("Failed to resolve subtitle from {} error {}", subtitleSource, e); + } + + } + return setOfSubs; + } + private Optional parseSubtitleSource(Optional text) { Optional sub = Optional.empty(); if (text.isPresent()) { @@ -119,7 +142,6 @@ private Optional parseSubtitleSource(Optional text) { } - private Optional> parseSubtitleUrls(JsonElement element) { Set urls = new HashSet<>(); JsonUtils.getElementValueAsString(element, TAG_SUBTITLE_SMI).ifPresent( stringUrl -> toURL(stringUrl).ifPresent(urls::add)); @@ -150,8 +172,7 @@ private Optional> parseUrl(JsonElement jsonElement) { LOG.debug("unkown video type {} ", jsonElement); } } - - + Optional> urls = Optional.empty(); Optional codec = Optional.empty(); // if (jsonElement.getAsJsonObject().has(TAG_VIDEO) && diff --git a/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnEpisodesDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnEpisodesDeserializer.java index 8a7637dcf..d8510d988 
100644 --- a/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnEpisodesDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnEpisodesDeserializer.java @@ -3,6 +3,7 @@ import com.google.gson.*; import de.mediathekview.mserver.base.utils.JsonUtils; +import de.mediathekview.mserver.crawler.basic.AbstractCrawler; import de.mediathekview.mserver.crawler.basic.PagedElementListDTO; import de.mediathekview.mserver.crawler.orfon.OrfOnVideoInfoDTO; @@ -12,7 +13,11 @@ public class OrfOnEpisodesDeserializer implements JsonDeserializer> { private static final String[] TAG_NEXT_PAGE = {"_links", "next", "href"}; private static final String[] TAG_ITEMS = {"_embedded", "items"}; - private static final OrfOnEpisodeDeserializer itemDeserializer = new OrfOnEpisodeDeserializer(); + private OrfOnEpisodeDeserializer itemDeserializer = null; + + public OrfOnEpisodesDeserializer(AbstractCrawler crawler) { + itemDeserializer = new OrfOnEpisodeDeserializer(crawler); + } @Override public PagedElementListDTO deserialize( diff --git a/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnHistoryChildrenDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnHistoryChildrenDeserializer.java index a9ce3a8bd..c0cb744fb 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnHistoryChildrenDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnHistoryChildrenDeserializer.java @@ -49,12 +49,6 @@ public PagedElementListDTO deserialize( } else { LOG.info("No video_items or children tag found {}",JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE) ); } - /* - LOG.debug("OrfOnHistoryChildrenDeserializer {} - {} - {}", - JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE), - JsonUtils.getElementValueAsString(item, TAG_TARGET_URL), - JsonUtils.getElementValueAsString(item, TAG_TARGET_URL2));*/ - } } // diff --git 
a/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnHistoryDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnHistoryDeserializer.java index 16793e466..fd718d41d 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnHistoryDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnHistoryDeserializer.java @@ -55,7 +55,6 @@ public PagedElementListDTO parseSection(JsonArray itemArr } else { LOG.debug("missing url for {}", title); } - //LOG.debug("History Item {} {}", title, url); } return items; } diff --git a/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnScheduleDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnScheduleDeserializer.java index e5baf69da..09d90b91e 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnScheduleDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/orfon/json/OrfOnScheduleDeserializer.java @@ -26,7 +26,6 @@ public PagedElementListDTO deserialize( final Optional id = JsonUtils.getElementValueAsString(element, TAG_FILM_ID); if (id.isPresent()) { final String url = OrfOnConstants.EPISODE + "/" + id.get(); - //LOG.debug("found {} {} {}", id, name, url); collectIds.addElement(new OrfOnBreadCrumsUrlDTO(id.get(), url)); } } diff --git a/src/main/java/de/mediathekview/mserver/crawler/orfon/task/OrfOnEpisodeTask.java b/src/main/java/de/mediathekview/mserver/crawler/orfon/task/OrfOnEpisodeTask.java index 56fa91f3b..7ea68931c 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/orfon/task/OrfOnEpisodeTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/orfon/task/OrfOnEpisodeTask.java @@ -30,7 +30,7 @@ public OrfOnEpisodeTask(AbstractCrawler crawler, Queue ur @Override protected JsonDeserializer getParser(OrfOnBreadCrumsUrlDTO aDTO) { - return new OrfOnEpisodeDeserializer(); + return new OrfOnEpisodeDeserializer(this.crawler); } @Override diff --git 
a/src/main/java/de/mediathekview/mserver/crawler/orfon/task/OrfOnEpisodesTask.java b/src/main/java/de/mediathekview/mserver/crawler/orfon/task/OrfOnEpisodesTask.java index 95ece1e96..20438c7f4 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/orfon/task/OrfOnEpisodesTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/orfon/task/OrfOnEpisodesTask.java @@ -34,7 +34,7 @@ public OrfOnEpisodesTask(AbstractCrawler crawler, Queue u @Override protected JsonDeserializer> getParser(OrfOnBreadCrumsUrlDTO aDTO) { - return new OrfOnEpisodesDeserializer(); + return new OrfOnEpisodesDeserializer(this.crawler); } @Override