Skip to content

Commit

Permalink
ORF: extend topic search
Browse files Browse the repository at this point in the history
  • Loading branch information
alex1702 committed Jan 8, 2024
2 parents 9c4d052 + 04535af commit 8f562be
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 13 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ sourceCompatibility = JavaVersion.VERSION_17
targetCompatibility = JavaVersion.VERSION_17
group = 'de.mediathekview'
archivesBaseName = "MServer"
version = '3.1.226'
version = '3.1.227'

def jarName = 'MServer.jar'
def mainClass = 'mServer.Main'
Expand Down
17 changes: 10 additions & 7 deletions src/main/java/mServer/crawler/sender/orf/OrfCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -74,28 +74,31 @@ private ConcurrentLinkedQueue<TopicUrlDTO> getLetterEntries() throws Interrupted
/**
 * Collects the ORF show URLs to crawl and wraps them in an {@link OrfFilmDetailTask}.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so superseded and
 * replacement lines appear side by side (two getDaysEntries() loops, two return statements).
 * Confirm against the checked-out file which lines are live before editing.
 *
 * @return the task that processes the collected show URLs
 */
@Override
protected RecursiveTask<Set<DatenFilm>> createCrawlerTask() {

// Stays false unless the loadLongMax() branch below runs; handed to the three-argument
// OrfFilmDetailTask constructor at the end of this method.
boolean processMoreEpisodes = false;

final ConcurrentLinkedQueue<TopicUrlDTO> shows = new ConcurrentLinkedQueue<>();
try {

// presumably loadLongMax() signals a full ("long") crawl run — verify in CrawlerTool
if (CrawlerTool.loadLongMax()) {
shows.addAll(getLetterEntries());
shows.addAll(getArchiveEntries());
processMoreEpisodes = true;
} else {
getDaysEntries().forEach(show -> {
// Skip shows that are already queued.
if (!shows.contains(show)) {
shows.add(show);
}
});
}

getDaysEntries().forEach(show -> {
// Skip shows that are already queued.
if (!shows.contains(show)) {
shows.add(show);
}
});

} catch (InterruptedException | ExecutionException exception) {
// The failure is only logged; the task below is still created with whatever was collected.
Log.errorLog(56146546, exception);
}
Log.sysLog("ORF Anzahl: " + shows.size());

meldungAddMax(shows.size());

return new OrfFilmDetailTask(this, shows);
return new OrfFilmDetailTask(this, shows, processMoreEpisodes);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package mServer.crawler.sender.orf.json;

import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import mServer.crawler.sender.base.CrawlerUrlDTO;
import mServer.crawler.sender.base.JsonUtils;
import mServer.crawler.sender.base.UrlUtils;
import mServer.crawler.sender.orf.OrfConstants;

import java.lang.reflect.Type;
import java.util.Optional;

/**
 * Gson deserializer that reads the {@code url} attribute of a JSON object and turns it into a
 * {@link CrawlerUrlDTO}, prefixing {@link OrfConstants#URL_BASE} when the URL has no domain.
 */
public class OrfMoreEpisodesDeserializer implements JsonDeserializer<CrawlerUrlDTO> {

  private static final String ATTRIBUTE_URL = "url";

  /**
   * Extracts the "more episodes" URL from the given JSON element.
   *
   * @param jsonElement the JSON to read; expected to be an object carrying a {@code url} attribute
   * @param type unused
   * @param jsonDeserializationContext unused
   * @return the URL DTO, or {@code null} when the element is not a JSON object or has no
   *     {@code url} attribute
   */
  @Override
  public CrawlerUrlDTO deserialize(
      JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) {

    // getAsJsonObject() throws IllegalStateException for arrays and primitives; the payload comes
    // from a scraped HTML attribute, so treat an unexpected shape like a missing URL instead.
    if (!jsonElement.isJsonObject()) {
      return null;
    }

    final Optional<String> url =
        JsonUtils.getAttributeAsString(jsonElement.getAsJsonObject(), ATTRIBUTE_URL);
    return url
        .map(s -> new CrawlerUrlDTO(UrlUtils.addDomainIfMissing(s, OrfConstants.URL_BASE)))
        .orElse(null);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package mServer.crawler.sender.orf.parser;


import mServer.crawler.sender.orf.TopicUrlDTO;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.List;

/**
 * Reads the episode links out of an HTML document and pairs each of them with a topic.
 */
public class OrfMoreEpisodesParser {
  private static final String EPISODES_SELECTOR = "article.b-teaser > a.teaser-link";
  private static final String ATTRIBUTE_HREF = "href";

  /**
   * Builds one {@link TopicUrlDTO} per element matching the teaser-link selector.
   *
   * @param document the HTML document to search
   * @param topic the topic assigned to every returned entry
   * @return a new, modifiable list in document order; empty when nothing matches
   */
  public List<TopicUrlDTO> parse(final Document document, final String topic) {
    final List<TopicUrlDTO> episodes = new ArrayList<>();

    for (final var link : document.select(EPISODES_SELECTOR)) {
      // The href value is taken over as-is.
      episodes.add(new TopicUrlDTO(topic, link.attr(ATTRIBUTE_HREF)));
    }

    return episodes;
  }
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package mServer.crawler.sender.orf.tasks;

import mServer.crawler.sender.base.AbstractUrlTask;
import mServer.crawler.sender.base.*;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
import de.mediathekview.mlib.daten.DatenFilm;
import de.mediathekview.mlib.tool.Log;

import java.io.IOException;
import java.lang.reflect.Type;
import java.time.Duration;
import java.time.LocalDateTime;
Expand All @@ -19,11 +21,11 @@
import java.util.concurrent.ConcurrentLinkedQueue;
import mServer.crawler.CrawlerTool;
import mServer.crawler.sender.MediathekReader;
import mServer.crawler.sender.base.Qualities;
import mServer.crawler.sender.base.HtmlDocumentUtils;
import mServer.crawler.sender.orf.OrfEpisodeInfoDTO;
import mServer.crawler.sender.orf.OrfVideoInfoDTO;
import mServer.crawler.sender.orf.TopicUrlDTO;
import mServer.crawler.sender.orf.json.OrfMoreEpisodesDeserializer;
import mServer.crawler.sender.orf.parser.OrfMoreEpisodesParser;
import mServer.crawler.sender.orf.parser.OrfPlaylistDeserializer;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
Expand All @@ -36,6 +38,7 @@ public class OrfFilmDetailTask extends OrfTaskBase<DatenFilm, TopicUrlDTO> {
private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration";
private static final String DESCRIPTION_SELECTOR = ".description-container .description-text";
private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist";
private static final String MORE_EPISODES_SELECTOR = "div.more-episodes";

private static final String ATTRIBUTE_DATETIME = "datetime";
private static final String ATTRIBUTE_DATA_JSB = "data-jsb";
Expand All @@ -50,12 +53,18 @@ public class OrfFilmDetailTask extends OrfTaskBase<DatenFilm, TopicUrlDTO> {
private static final DateTimeFormatter TIME_FORMAT
= DateTimeFormatter.ofPattern("HH:mm:ss");

private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken<CrawlerUrlDTO>() {}.getType();
private static final Type LIST_EPISODEINFO_TYPE_TOKEN = new TypeToken<List<OrfEpisodeInfoDTO>>() {
}.getType();

private final boolean processMoreEpisodes;
private final transient JsoupConnection jsoupConnection;

/**
 * Creates a detail task for a queue of topic URLs.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so two parameter-list
 * lines appear below (with and without the boolean). Confirm against the checked-out file
 * which one is live before editing.
 *
 * @param aCrawler passed through to the superclass constructor
 * @param aUrlToCrawlDTOs passed through to the superclass constructor
 * @param processMoreEpisodes stored in the field of the same name
 */
public OrfFilmDetailTask(final MediathekReader aCrawler,
final ConcurrentLinkedQueue<TopicUrlDTO> aUrlToCrawlDTOs) {
final ConcurrentLinkedQueue<TopicUrlDTO> aUrlToCrawlDTOs, boolean processMoreEpisodes) {
super(aCrawler, aUrlToCrawlDTOs);
this.processMoreEpisodes = processMoreEpisodes;
// Each task instance gets its own connection object.
jsoupConnection = new JsoupConnection();
}

@Override
Expand All @@ -76,12 +85,22 @@ protected void processDocument(TopicUrlDTO aUrlDTO, Document aDocument) {
}
}

if (processMoreEpisodes) {
final List<TopicUrlDTO> topicUrlDTOS = parseMoreEpisodes(aDocument, aUrlDTO.getTopic());
topicUrlDTOS.remove(aUrlDTO);
processMoreEpisodes(topicUrlDTOS);
}

ORF_LOGGER.trace(String.format("%s - %s: Anzahl Filme: %d", aUrlDTO.getTopic(), aUrlDTO.getUrl(), taskResults.size()));
}

/**
 * Creates another {@link OrfFilmDetailTask} for the given URLs.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so two return lines
 * appear below. The second one forwards this task's processMoreEpisodes value; confirm
 * against the checked-out file which line is live before editing.
 *
 * @param aURLsToCrawl the URLs the new task should process
 * @return the new task instance
 */
@Override
protected AbstractUrlTask<DatenFilm, TopicUrlDTO> createNewOwnInstance(ConcurrentLinkedQueue<TopicUrlDTO> aURLsToCrawl) {
return new OrfFilmDetailTask(crawler, aURLsToCrawl);
return createNewOwnInstance(aURLsToCrawl, processMoreEpisodes);
}

/**
 * Creates another {@link OrfFilmDetailTask} with an explicitly chosen processMoreEpisodes value.
 *
 * @param urlsToCrawl the URLs the new task should process
 * @param processMoreEpisodes value handed to the new task's constructor
 * @return the new task instance
 */
private AbstractUrlTask<DatenFilm, TopicUrlDTO> createNewOwnInstance(final ConcurrentLinkedQueue<TopicUrlDTO> urlsToCrawl, boolean processMoreEpisodes) {
  final OrfFilmDetailTask newTask =
      new OrfFilmDetailTask(crawler, urlsToCrawl, processMoreEpisodes);
  return newTask;
}

private void createFilm(final TopicUrlDTO aUrlDTO,
Expand Down Expand Up @@ -218,4 +237,36 @@ private static Optional<ChronoUnit> determineChronoUnit(String aDuration) {

return Optional.empty();
}

/**
 * Follows the "more episodes" link of a detail page, if there is one, and returns the episode
 * URLs found on the linked page.
 *
 * @param document the detail page to inspect
 * @param topic the topic assigned to every returned entry
 * @return the episode entries of the linked page; a new empty list when the page has no
 *     more-episodes element, the element carries no URL, or loading the linked page fails
 */
private List<TopicUrlDTO> parseMoreEpisodes(final Document document, final String topic) {
  final Optional<String> json = HtmlDocumentUtils.getElementAttributeString(MORE_EPISODES_SELECTOR, ATTRIBUTE_DATA_JSB, document);
  if (json.isEmpty()) {
    return new ArrayList<>();
  }

  final Gson gson =
      new GsonBuilder()
          .registerTypeAdapter(CRAWLER_URL_TYPE_TOKEN, new OrfMoreEpisodesDeserializer())
          .create();

  // The deserializer yields null when the JSON carries no url attribute.
  final CrawlerUrlDTO moreEpisodesUrl = gson.fromJson(json.get(), CRAWLER_URL_TYPE_TOKEN);
  if (moreEpisodesUrl == null) {
    return new ArrayList<>();
  }

  try {
    final Document moreEpisodesDocument = jsoupConnection.getDocument(moreEpisodesUrl.getUrl());
    return new OrfMoreEpisodesParser().parse(moreEpisodesDocument, topic);
  } catch (IOException e) {
    // Hand the exception to the logger so the cause of the failed request is not lost.
    Log.errorLog(237462889, e, String.format("OrfFilmDetailTask: loading more episodes url %s failed.", moreEpisodesUrl.getUrl()));
    return new ArrayList<>();
  }
}

/**
 * Crawls the given additional episode URLs in a separate task and adds the films it finds to
 * this task's results.
 *
 * @param moreFilms the additional episode URLs; nothing happens when null or empty
 */
private void processMoreEpisodes(final List<TopicUrlDTO> moreFilms) {
  if (moreFilms == null || moreFilms.isEmpty()) {
    return;
  }

  // The sub task is created with processMoreEpisodes = false.
  final OrfFilmDetailTask subTask =
      new OrfFilmDetailTask(crawler, new ConcurrentLinkedQueue<>(moreFilms), false);
  taskResults.addAll(subTask.fork().join());
}
}

0 comments on commit 8f562be

Please sign in to comment.