From 0c3cb99fa30de4debf70689157b993fbbf1940e9 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Sat, 6 Jan 2024 12:05:38 +0100 Subject: [PATCH] search more episodes --- .../mserver/crawler/orf/OrfCrawler.java | 6 +- .../orf/json/OrfMoreEpisodesDeserializer.java | 26 + .../orf/parser/OrfMoreEpisodesParser.java | 25 + .../crawler/orf/tasks/OrfFilmDetailTask.java | 62 +- .../json/OrfMoreEpisodesDeserializerTest.java | 23 + .../orf/parser/OrfMoreEpisodesParserTest.java | 39 ++ .../orf/tasks/OrfFilmDetailTaskTestBase.java | 2 +- .../resources/orf/orf_film_more_episodes.html | 653 ++++++++++++++++++ .../resources/orf/orf_film_more_episodes.json | 1 + .../orf/orf_film_with_other_episodes.html | 430 ++++++++++++ 10 files changed, 1257 insertions(+), 10 deletions(-) create mode 100644 src/main/java/de/mediathekview/mserver/crawler/orf/json/OrfMoreEpisodesDeserializer.java create mode 100644 src/main/java/de/mediathekview/mserver/crawler/orf/parser/OrfMoreEpisodesParser.java create mode 100644 src/test/java/de/mediathekview/mserver/crawler/orf/json/OrfMoreEpisodesDeserializerTest.java create mode 100644 src/test/java/de/mediathekview/mserver/crawler/orf/parser/OrfMoreEpisodesParserTest.java create mode 100644 src/test/resources/orf/orf_film_more_episodes.html create mode 100644 src/test/resources/orf/orf_film_more_episodes.json create mode 100644 src/test/resources/orf/orf_film_with_other_episodes.html diff --git a/src/main/java/de/mediathekview/mserver/crawler/orf/OrfCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/orf/OrfCrawler.java index 10d3dd2f1..3a439b43e 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/orf/OrfCrawler.java +++ b/src/main/java/de/mediathekview/mserver/crawler/orf/OrfCrawler.java @@ -99,13 +99,14 @@ private Queue getLetterEntries() throws InterruptedException, Execu @Override protected RecursiveTask> createCrawlerTask() { try { + boolean processMoreEpisodes = false; final Queue shows = new ConcurrentLinkedQueue<>(); if (Boolean.TRUE.equals(crawlerConfig.getTopicsSearchEnabled())) { shows.addAll(getArchiveEntries()); - addShows(shows, getLetterEntries()); + processMoreEpisodes = true; } addShows(shows, getDaysEntries()); @@ -113,7 +114,8 @@ protected RecursiveTask> createCrawlerTask() { ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size()); getAndSetMaxCount(shows.size()); - return new OrfFilmDetailTask(this, shows); + // TODO Problem mit Sport aktuell u.ä. lösen => more episodes pro show setzen (topic ja, day nein?) + return new OrfFilmDetailTask(this, shows, processMoreEpisodes); } catch (final InterruptedException ex) { LOG.debug("{} crawler interrupted.", getSender().getName(), ex); Thread.currentThread().interrupt(); diff --git a/src/main/java/de/mediathekview/mserver/crawler/orf/json/OrfMoreEpisodesDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/orf/json/OrfMoreEpisodesDeserializer.java new file mode 100644 index 000000000..45b52710b --- /dev/null +++ b/src/main/java/de/mediathekview/mserver/crawler/orf/json/OrfMoreEpisodesDeserializer.java @@ -0,0 +1,26 @@ +package de.mediathekview.mserver.crawler.orf.json; + +import com.google.gson.JsonDeserializationContext; +import com.google.gson.JsonDeserializer; +import com.google.gson.JsonElement; +import de.mediathekview.mserver.base.utils.JsonUtils; +import de.mediathekview.mserver.base.utils.UrlUtils; +import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; +import de.mediathekview.mserver.crawler.orf.OrfConstants; + +import java.lang.reflect.Type; +import java.util.Optional; + +public class OrfMoreEpisodesDeserializer implements JsonDeserializer { + + private static final String ATTRIBUTE_URL = "url"; + + @Override + public CrawlerUrlDTO deserialize( + JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) { + + final Optional url = + JsonUtils.getAttributeAsString(jsonElement.getAsJsonObject(), ATTRIBUTE_URL); + return url.map(s -> new CrawlerUrlDTO(UrlUtils.addDomainIfMissing(s, OrfConstants.URL_BASE))).orElse(null); + } +} diff --git a/src/main/java/de/mediathekview/mserver/crawler/orf/parser/OrfMoreEpisodesParser.java b/src/main/java/de/mediathekview/mserver/crawler/orf/parser/OrfMoreEpisodesParser.java new file mode 100644 index 000000000..28a6c3418 --- /dev/null +++ b/src/main/java/de/mediathekview/mserver/crawler/orf/parser/OrfMoreEpisodesParser.java @@ -0,0 +1,25 @@ +package de.mediathekview.mserver.crawler.orf.parser; + +import de.mediathekview.mserver.base.HtmlConsts; +import de.mediathekview.mserver.crawler.basic.TopicUrlDTO; +import java.util.ArrayList; +import java.util.List; +import org.jsoup.nodes.Document; + +public class OrfMoreEpisodesParser { + private static final String EPISODES_SELECTOR = "article.b-teaser > a.teaser-link"; + + public List parse(final Document document, final String topic) { + final List result = new ArrayList<>(); + + document + .select(EPISODES_SELECTOR) + .forEach( + episode -> { + final String url = episode.attr(HtmlConsts.ATTRIBUTE_HREF); + result.add(new TopicUrlDTO(topic, url)); + }); + + return result; + } +} diff --git a/src/main/java/de/mediathekview/mserver/crawler/orf/tasks/OrfFilmDetailTask.java b/src/main/java/de/mediathekview/mserver/crawler/orf/tasks/OrfFilmDetailTask.java index 1ad84f51e..1c494c830 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/orf/tasks/OrfFilmDetailTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/orf/tasks/OrfFilmDetailTask.java @@ -8,18 +8,18 @@ import de.mediathekview.mlib.daten.GeoLocations; import de.mediathekview.mlib.daten.Resolution; import de.mediathekview.mserver.base.utils.HtmlDocumentUtils; -import de.mediathekview.mserver.crawler.basic.AbstractCrawler; -import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask; -import de.mediathekview.mserver.crawler.basic.AbstractUrlTask; -import de.mediathekview.mserver.crawler.basic.TopicUrlDTO; +import de.mediathekview.mserver.crawler.basic.*; import de.mediathekview.mserver.crawler.orf.OrfEpisodeInfoDTO; import de.mediathekview.mserver.crawler.orf.OrfVideoInfoDTO; +import de.mediathekview.mserver.crawler.orf.json.OrfMoreEpisodesDeserializer; +import de.mediathekview.mserver.crawler.orf.parser.OrfMoreEpisodesParser; import de.mediathekview.mserver.crawler.orf.parser.OrfPlaylistDeserializer; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.jsoup.nodes.Document; +import java.io.IOException; import java.lang.reflect.Type; import java.net.MalformedURLException; import java.net.URL; @@ -29,6 +29,7 @@ import java.time.format.DateTimeParseException; import java.time.temporal.ChronoUnit; import java.util.*; +import java.util.concurrent.ConcurrentLinkedQueue; public class OrfFilmDetailTask extends AbstractDocumentTask { @@ -40,21 +41,25 @@ public class OrfFilmDetailTask extends AbstractDocumentTask { private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration"; private static final String DESCRIPTION_SELECTOR = ".description-container .description-text"; private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist"; + private static final String MORE_EPISODES_SELECTOR = "div.more-episodes"; private static final String ATTRIBUTE_DATETIME = "datetime"; private static final String ATTRIBUTE_DATA_JSB = "data-jsb"; - private static final String PREFIX_AUDIO_DESCRIPTION = "AD |"; private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken() {}.getType(); private static final Type LIST_EPISODEINFO_TYPE_TOKEN = new TypeToken>() {}.getType(); + private final boolean processMoreEpisodes; public OrfFilmDetailTask( - final AbstractCrawler aCrawler, final Queue aUrlToCrawlDtos) { + final AbstractCrawler aCrawler, final Queue aUrlToCrawlDtos, boolean processMoreEpisodes) { super(aCrawler, aUrlToCrawlDtos); + + this.processMoreEpisodes = processMoreEpisodes; } private static Optional parseDate(final Document aDocument) { @@ -147,12 +152,22 @@ protected void processDocument(final TopicUrlDTO aUrlDto, final Document aDocume episode.getDuration()); } } + + if (processMoreEpisodes) { + final List topicUrlDTOS = parseMoreEpisodes(aDocument, aUrlDto.getTopic()); + topicUrlDTOS.remove(aUrlDto); + processMoreEpisodes(topicUrlDTOS); + } } @Override protected AbstractUrlTask createNewOwnInstance( final Queue aUrlsToCrawl) { - return new OrfFilmDetailTask(crawler, aUrlsToCrawl); + return createNewOwnInstance(aUrlsToCrawl, true); + } + + private AbstractUrlTask createNewOwnInstance(final Queue urlsToCrawl, boolean processMoreEpisodes) { + return new OrfFilmDetailTask(crawler, urlsToCrawl, processMoreEpisodes); } private void createFilm( @@ -255,4 +270,37 @@ private List parseEpisodes(final Document aDocument) { return new ArrayList<>(); } + + private List parseMoreEpisodes(final Document document, final String topic) { + final Optional json = HtmlDocumentUtils.getElementAttributeString(MORE_EPISODES_SELECTOR, ATTRIBUTE_DATA_JSB, document); + if (json.isPresent()) { + final Gson gson = + new GsonBuilder() + .registerTypeAdapter(CRAWLER_URL_TYPE_TOKEN, new OrfMoreEpisodesDeserializer()) + .create(); + + CrawlerUrlDTO moreEpisodesUrl = gson.fromJson(json.get(), CRAWLER_URL_TYPE_TOKEN); + if (moreEpisodesUrl != null) { + try { + final Document moreEpisodesDocument = crawler.requestBodyAsHtmlDocument(moreEpisodesUrl.getUrl()); + OrfMoreEpisodesParser parser = new OrfMoreEpisodesParser(); + return parser.parse(moreEpisodesDocument, topic); + } catch (IOException e) { + LOG.error("OrfFilmDetailTask: loading more episodes url {} failed.", moreEpisodesUrl.getUrl()); + crawler.incrementAndGetErrorCount(); + } + } + } + + return new ArrayList<>(); + } + + private void processMoreEpisodes(final List moreFilms) { + if (moreFilms != null && !moreFilms.isEmpty()) { + final Queue queue = new ConcurrentLinkedQueue<>(moreFilms); + final OrfFilmDetailTask task = (OrfFilmDetailTask) createNewOwnInstance(queue, false); + task.fork(); + taskResults.addAll(task.join()); + } + } } diff --git a/src/test/java/de/mediathekview/mserver/crawler/orf/json/OrfMoreEpisodesDeserializerTest.java b/src/test/java/de/mediathekview/mserver/crawler/orf/json/OrfMoreEpisodesDeserializerTest.java new file mode 100644 index 000000000..b7f536160 --- /dev/null +++ b/src/test/java/de/mediathekview/mserver/crawler/orf/json/OrfMoreEpisodesDeserializerTest.java @@ -0,0 +1,23 @@ +package de.mediathekview.mserver.crawler.orf.json; + +import com.google.gson.JsonElement; +import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; +import de.mediathekview.mserver.testhelper.JsonFileReader; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class OrfMoreEpisodesDeserializerTest { + + @Test + void testDeserialize() { + final JsonElement jsonElement = JsonFileReader.readJson("/orf/orf_film_more_episodes.json"); + + final OrfMoreEpisodesDeserializer target = new OrfMoreEpisodesDeserializer(); + final CrawlerUrlDTO actual = target.deserialize(jsonElement, null, null); + + assertNotNull(actual); + assertEquals("https://tvthek.orf.at/lane-plus/other_episodes_of_profile?profileId=13895917&profileSlug=Biester", actual.getUrl()); + + } +} diff --git a/src/test/java/de/mediathekview/mserver/crawler/orf/parser/OrfMoreEpisodesParserTest.java b/src/test/java/de/mediathekview/mserver/crawler/orf/parser/OrfMoreEpisodesParserTest.java new file mode 100644 index 000000000..63893dc77 --- /dev/null +++ b/src/test/java/de/mediathekview/mserver/crawler/orf/parser/OrfMoreEpisodesParserTest.java @@ -0,0 +1,39 @@ +package de.mediathekview.mserver.crawler.orf.parser; + +import de.mediathekview.mserver.crawler.basic.TopicUrlDTO; +import de.mediathekview.mserver.testhelper.FileReader; +import org.hamcrest.MatcherAssert; +import org.hamcrest.Matchers; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +class OrfMoreEpisodesParserTest { + @Test + void parseDocumentWithEpisodes() { + TopicUrlDTO[] expectedFilms = new TopicUrlDTO[] { + new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-9/14207236"), + new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-8/14207235"), + new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-7/14207234"), + new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-6/14207233"), + new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-5/14207232"), + new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-4/14207231"), + new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-3/14207230"), + new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-2/14207229"), + new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Alle-Folgen-jetzt-Biester-1-10/14207227"), + new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-10/14207252"), + }; + + final Document document = Jsoup.parse(FileReader.readFile("/orf/orf_film_more_episodes.html")); + + OrfMoreEpisodesParser target = new OrfMoreEpisodesParser(); + final List actual = target.parse(document, "Biester"); + + assertEquals(10, actual.size()); + MatcherAssert.assertThat(actual, Matchers.containsInAnyOrder(expectedFilms)); + } +} diff --git a/src/test/java/de/mediathekview/mserver/crawler/orf/tasks/OrfFilmDetailTaskTestBase.java b/src/test/java/de/mediathekview/mserver/crawler/orf/tasks/OrfFilmDetailTaskTestBase.java index 4f5f39632..85a0ee3fd 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/orf/tasks/OrfFilmDetailTaskTestBase.java +++ b/src/test/java/de/mediathekview/mserver/crawler/orf/tasks/OrfFilmDetailTaskTestBase.java @@ -12,7 +12,7 @@ public OrfFilmDetailTaskTestBase() { } protected Set executeTask(OrfCrawler crawler, String aTheme, String aRequestUrl) { - return new OrfFilmDetailTask(crawler, createCrawlerUrlDto(aTheme, aRequestUrl)) + return new OrfFilmDetailTask(crawler, createCrawlerUrlDto(aTheme, aRequestUrl), false) .invoke(); } } diff --git a/src/test/resources/orf/orf_film_more_episodes.html b/src/test/resources/orf/orf_film_more_episodes.html new file mode 100644 index 000000000..163aa2557 --- /dev/null +++ b/src/test/resources/orf/orf_film_more_episodes.html @@ -0,0 +1,653 @@ + diff --git a/src/test/resources/orf/orf_film_more_episodes.json b/src/test/resources/orf/orf_film_more_episodes.json new file mode 100644 index 000000000..4321de98e --- /dev/null +++ b/src/test/resources/orf/orf_film_more_episodes.json @@ -0,0 +1 @@ +{"url": "/lane-plus/other_episodes_of_profile?profileId=13895917&profileSlug=Biester", "classes": "has-datetime", "force_count": 12, "remove_element": "Sendung-14207252" } \ No newline at end of file diff --git a/src/test/resources/orf/orf_film_with_other_episodes.html b/src/test/resources/orf/orf_film_with_other_episodes.html new file mode 100644 index 000000000..422bc09a6 --- /dev/null +++ b/src/test/resources/orf/orf_film_with_other_episodes.html @@ -0,0 +1,430 @@ + + + + + + + Biester: Folge 10 vom 30.12.2023 um 04:05 Uhr – ORF-TVthek + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + + +
+ + +
+
+
+ +

Ihr Browser hat die automatische Wiedergabe von Videos deaktiviert.

+
+ Hilfe +
+
+ +
+
+
+
+
+
+
+
+ +
+
+
+
+
+ + + +
+ + +
+
+
+ + +

+ Film & Serie | + + Neue ORF-Serie +

+

Biester: Folge 10

+

+ Pius und Günter sind sich plötzlich näher, als ihnen lieb ist. Und auch Dorit und Sandra haben nun einiges gemeinsam. Jenny erkennt, dass ihr "Plan" nicht aufgegangen ist und sie trotz der ganzen Anstrengung jetzt genau so schlecht dastehen, wie vorher. Aber Vero gibt nicht auf. Die Besties setzen zum finalen Schlag an….
+ Besetzung: Anja Pichler, Mara Romei, Fanni Schneider, Theresa Riess, Felix Oitzinger u.a.
+ Regie: Andreas Kopriva
+ Drehbuch: Uli Brée
+ Bildquelle: ORF/MR-Film +

+
+ + +
+ +
+ +
+ + +
+ +
+ + +
+ + + + +
+ +
+ +
+ + +
+
+ + + + + + + + + + + + + + + + + +
+
+
+
+ + + + +