Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/955 #956

Merged
merged 2 commits into from
Jan 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -99,21 +99,24 @@ private Queue<TopicUrlDTO> getLetterEntries() throws InterruptedException, Execu
@Override
protected RecursiveTask<Set<Film>> createCrawlerTask() {
try {
boolean processMoreEpisodes = false;

final Queue<TopicUrlDTO> shows = new ConcurrentLinkedQueue<>();

if (Boolean.TRUE.equals(crawlerConfig.getTopicsSearchEnabled())) {
shows.addAll(getArchiveEntries());

addShows(shows, getLetterEntries());
processMoreEpisodes = true;
} else {
addShows(shows, getDaysEntries());
processMoreEpisodes = false;
}
addShows(shows, getDaysEntries());

printMessage(
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());
getAndSetMaxCount(shows.size());

return new OrfFilmDetailTask(this, shows);
return new OrfFilmDetailTask(this, shows, processMoreEpisodes);
} catch (final InterruptedException ex) {
LOG.debug("{} crawler interrupted.", getSender().getName(), ex);
Thread.currentThread().interrupt();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package de.mediathekview.mserver.crawler.orf.json;

import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import de.mediathekview.mserver.base.utils.JsonUtils;
import de.mediathekview.mserver.base.utils.UrlUtils;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.crawler.orf.OrfConstants;

import java.lang.reflect.Type;
import java.util.Optional;

/**
 * Gson deserializer that reads the {@code url} attribute of a JSON object and wraps it in a
 * {@link CrawlerUrlDTO}.
 */
public class OrfMoreEpisodesDeserializer implements JsonDeserializer<CrawlerUrlDTO> {

  private static final String ATTRIBUTE_URL = "url";

  /**
   * Builds a {@link CrawlerUrlDTO} from the {@code url} attribute of the given element.
   *
   * @param jsonElement the JSON element to read; it is accessed as a JSON object
   * @param type unused
   * @param jsonDeserializationContext unused
   * @return the DTO for the url after it was passed through {@link UrlUtils#addDomainIfMissing}
   *     with {@link OrfConstants#URL_BASE}, or {@code null} if no url attribute value is available
   */
  @Override
  public CrawlerUrlDTO deserialize(
      final JsonElement jsonElement,
      final Type type,
      final JsonDeserializationContext jsonDeserializationContext) {

    final Optional<String> rawUrl =
        JsonUtils.getAttributeAsString(jsonElement.getAsJsonObject(), ATTRIBUTE_URL);

    // Without a url there is nothing to crawl; callers have to handle the null result.
    if (!rawUrl.isPresent()) {
      return null;
    }

    return new CrawlerUrlDTO(UrlUtils.addDomainIfMissing(rawUrl.get(), OrfConstants.URL_BASE));
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package de.mediathekview.mserver.crawler.orf.parser;

import de.mediathekview.mserver.base.HtmlConsts;
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.nodes.Document;

/** Extracts episode links from an HTML document listing further episodes of a topic. */
public class OrfMoreEpisodesParser {
  private static final String EPISODES_SELECTOR = "article.b-teaser > a.teaser-link";

  /**
   * Creates one {@link TopicUrlDTO} for every element matching the episode selector.
   *
   * @param document the HTML document to search for episode links
   * @param topic the topic assigned to every created entry
   * @return a new, mutable list with one entry per matched element; empty if nothing matches
   */
  public List<TopicUrlDTO> parse(final Document document, final String topic) {
    // Kept as a mutable ArrayList on purpose: callers may remove entries from the result.
    final List<TopicUrlDTO> episodes = new ArrayList<>();

    document
        .select(EPISODES_SELECTOR)
        .forEach(
            link -> episodes.add(new TopicUrlDTO(topic, link.attr(HtmlConsts.ATTRIBUTE_HREF))));

    return episodes;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@
import de.mediathekview.mlib.daten.GeoLocations;
import de.mediathekview.mlib.daten.Resolution;
import de.mediathekview.mserver.base.utils.HtmlDocumentUtils;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
import de.mediathekview.mserver.crawler.basic.AbstractUrlTask;
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import de.mediathekview.mserver.crawler.basic.*;
import de.mediathekview.mserver.crawler.orf.OrfEpisodeInfoDTO;
import de.mediathekview.mserver.crawler.orf.OrfVideoInfoDTO;
import de.mediathekview.mserver.crawler.orf.json.OrfMoreEpisodesDeserializer;
import de.mediathekview.mserver.crawler.orf.parser.OrfMoreEpisodesParser;
import de.mediathekview.mserver.crawler.orf.parser.OrfPlaylistDeserializer;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.lang.reflect.Type;
import java.net.MalformedURLException;
import java.net.URL;
Expand All @@ -29,6 +29,7 @@
import java.time.format.DateTimeParseException;
import java.time.temporal.ChronoUnit;
import java.util.*;
import java.util.concurrent.ConcurrentLinkedQueue;

public class OrfFilmDetailTask extends AbstractDocumentTask<Film, TopicUrlDTO> {

Expand All @@ -40,21 +41,25 @@ public class OrfFilmDetailTask extends AbstractDocumentTask<Film, TopicUrlDTO> {
private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration";
private static final String DESCRIPTION_SELECTOR = ".description-container .description-text";
private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist";
private static final String MORE_EPISODES_SELECTOR = "div.more-episodes";

private static final String ATTRIBUTE_DATETIME = "datetime";
private static final String ATTRIBUTE_DATA_JSB = "data-jsb";

private static final String PREFIX_AUDIO_DESCRIPTION = "AD |";

private static final DateTimeFormatter DATE_TIME_FORMATTER =
DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken<CrawlerUrlDTO>() {}.getType();
private static final Type LIST_EPISODEINFO_TYPE_TOKEN =
new TypeToken<List<OrfEpisodeInfoDTO>>() {}.getType();
private final boolean processMoreEpisodes;

public OrfFilmDetailTask(
final AbstractCrawler aCrawler, final Queue<TopicUrlDTO> aUrlToCrawlDtos) {
final AbstractCrawler aCrawler, final Queue<TopicUrlDTO> aUrlToCrawlDtos, boolean processMoreEpisodes) {
super(aCrawler, aUrlToCrawlDtos);

this.processMoreEpisodes = processMoreEpisodes;
}

private static Optional<LocalDateTime> parseDate(final Document aDocument) {
Expand Down Expand Up @@ -147,12 +152,22 @@ protected void processDocument(final TopicUrlDTO aUrlDto, final Document aDocume
episode.getDuration());
}
}

if (processMoreEpisodes) {
final List<TopicUrlDTO> topicUrlDTOS = parseMoreEpisodes(aDocument, aUrlDto.getTopic());
topicUrlDTOS.remove(aUrlDto);
processMoreEpisodes(topicUrlDTOS);
}
}

@Override
protected AbstractUrlTask<Film, TopicUrlDTO> createNewOwnInstance(
final Queue<TopicUrlDTO> aUrlsToCrawl) {
return new OrfFilmDetailTask(crawler, aUrlsToCrawl);
return createNewOwnInstance(aUrlsToCrawl, processMoreEpisodes);
}

/**
 * Creates a new task of this type for the given urls.
 *
 * @param urlsToCrawl the urls the new task has to process
 * @param processMoreEpisodes value of the "process more episodes" flag handed to the new task
 * @return the new task instance, sharing this task's crawler
 */
private AbstractUrlTask<Film, TopicUrlDTO> createNewOwnInstance(
    final Queue<TopicUrlDTO> urlsToCrawl, final boolean processMoreEpisodes) {
  return new OrfFilmDetailTask(crawler, urlsToCrawl, processMoreEpisodes);
}

private void createFilm(
Expand Down Expand Up @@ -255,4 +270,37 @@ private List<OrfEpisodeInfoDTO> parseEpisodes(final Document aDocument) {

return new ArrayList<>();
}

/**
 * Determines the urls of further episodes referenced by the given film page.
 *
 * <p>The {@code data-jsb} attribute of the "more episodes" element is read as JSON, the url
 * contained in it is requested and the resulting document is handed to
 * {@link OrfMoreEpisodesParser}.
 *
 * @param document the film page to inspect
 * @param topic the topic assigned to every returned entry
 * @return the episodes found; an empty mutable list if the page has no "more episodes" element,
 *     the JSON yields no url or loading the url fails
 */
private List<TopicUrlDTO> parseMoreEpisodes(final Document document, final String topic) {
  final Optional<String> json =
      HtmlDocumentUtils.getElementAttributeString(
          MORE_EPISODES_SELECTOR, ATTRIBUTE_DATA_JSB, document);
  if (json.isPresent()) {
    final Gson gson =
        new GsonBuilder()
            .registerTypeAdapter(CRAWLER_URL_TYPE_TOKEN, new OrfMoreEpisodesDeserializer())
            .create();

    // The deserializer returns null when the JSON carries no url attribute.
    final CrawlerUrlDTO moreEpisodesUrl = gson.fromJson(json.get(), CRAWLER_URL_TYPE_TOKEN);
    if (moreEpisodesUrl != null) {
      try {
        final Document moreEpisodesDocument =
            crawler.requestBodyAsHtmlDocument(moreEpisodesUrl.getUrl());
        final OrfMoreEpisodesParser parser = new OrfMoreEpisodesParser();
        return parser.parse(moreEpisodesDocument, topic);
      } catch (final IOException e) {
        // Pass the exception as trailing argument so its cause and stack trace are logged
        // instead of being silently dropped.
        LOG.error(
            "OrfFilmDetailTask: loading more episodes url {} failed.",
            moreEpisodesUrl.getUrl(),
            e);
        crawler.incrementAndGetErrorCount();
      }
    }
  }

  return new ArrayList<>();
}

/**
 * Crawls the given additional episodes in a separate task and adds its films to the results of
 * this task.
 *
 * @param moreFilms the additional episode urls; {@code null} or an empty list is ignored
 */
private void processMoreEpisodes(final List<TopicUrlDTO> moreFilms) {
  if (moreFilms == null || moreFilms.isEmpty()) {
    return;
  }

  final Queue<TopicUrlDTO> pendingEpisodes = new ConcurrentLinkedQueue<>(moreFilms);
  // The sub task is created with the flag disabled, so it does not look for more episodes
  // on the pages it processes.
  final OrfFilmDetailTask subTask =
      (OrfFilmDetailTask) createNewOwnInstance(pendingEpisodes, false);
  subTask.fork();
  taskResults.addAll(subTask.join());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package de.mediathekview.mserver.crawler.orf.json;

import com.google.gson.JsonElement;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.testhelper.JsonFileReader;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

/** Tests {@link OrfMoreEpisodesDeserializer} against a stored JSON sample. */
class OrfMoreEpisodesDeserializerTest {

  /** The url of the sample has to be returned as url on the tvthek.orf.at domain. */
  @Test
  void testDeserialize() {
    final OrfMoreEpisodesDeserializer target = new OrfMoreEpisodesDeserializer();
    final JsonElement sample = JsonFileReader.readJson("/orf/orf_film_more_episodes.json");

    // Type and context are not needed by the deserializer under test.
    final CrawlerUrlDTO actual = target.deserialize(sample, null, null);

    assertNotNull(actual);
    assertEquals(
        "https://tvthek.orf.at/lane-plus/other_episodes_of_profile?profileId=13895917&profileSlug=Biester",
        actual.getUrl());
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package de.mediathekview.mserver.crawler.orf.parser;

import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import de.mediathekview.mserver.testhelper.FileReader;
import org.hamcrest.MatcherAssert;
import org.hamcrest.Matchers;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.jupiter.api.Test;

import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

/** Tests {@link OrfMoreEpisodesParser} against a stored HTML sample. */
class OrfMoreEpisodesParserTest {

  /** All ten episode links of the sample have to be found, in any order. */
  @Test
  void parseDocumentWithEpisodes() {
    final Document document = Jsoup.parse(FileReader.readFile("/orf/orf_film_more_episodes.html"));
    final OrfMoreEpisodesParser target = new OrfMoreEpisodesParser();

    final List<TopicUrlDTO> actual = target.parse(document, "Biester");

    final TopicUrlDTO[] expectedFilms = {
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-9/14207236"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-8/14207235"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-7/14207234"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-6/14207233"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-5/14207232"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-4/14207231"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-3/14207230"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-2/14207229"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Alle-Folgen-jetzt-Biester-1-10/14207227"),
      new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-10/14207252"),
    };

    // The expectation holds ten entries, so this pins the result size to ten as well.
    assertEquals(expectedFilms.length, actual.size());
    MatcherAssert.assertThat(actual, Matchers.containsInAnyOrder(expectedFilms));
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public OrfFilmDetailTaskTestBase() {
}

protected Set<Film> executeTask(OrfCrawler crawler, String aTheme, String aRequestUrl) {
return new OrfFilmDetailTask(crawler, createCrawlerUrlDto(aTheme, aRequestUrl))
return new OrfFilmDetailTask(crawler, createCrawlerUrlDto(aTheme, aRequestUrl), false)
.invoke();
}
}
Loading
Loading