Skip to content

Commit

Permalink
init POC
Browse files Browse the repository at this point in the history
  • Loading branch information
codingPF committed Jun 9, 2024
1 parent d85c395 commit 6eba7cb
Show file tree
Hide file tree
Showing 9 changed files with 614 additions and 5 deletions.
11 changes: 6 additions & 5 deletions MServer-Config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ maximumRequestsPerSecond: 999.0
# If set, only these senders will be crawled; all others will be ignored.
senderIncluded:
#- ARD
#- ARTE_DE
- ARTE_DE
#- ARTE_FR
#- ARTE_EN
#- ARTE_PL
Expand All @@ -32,7 +32,7 @@ senderIncluded:
#- PHOENIX
#- SRF
#- SR
- ZDF
#- ZDF

#SRF,SR,PHOENIX,ORF,KIKA,DW,3SAT

Expand Down Expand Up @@ -159,9 +159,10 @@ senderConfigurations:
ORF:
maximumRequestsPerSecond: 10.0
ARTE_DE:
maximumUrlsPerTask: 1
maximumDaysForSendungVerpasstSectionFuture: 0
maximumRequestsPerSecond: 2.0
maximumSubpages: 2
#maximumUrlsPerTask: 1
#maximumDaysForSendungVerpasstSectionFuture: 0
#maximumRequestsPerSecond: 2.0
ARTE_FR:
maximumDaysForSendungVerpasstSectionFuture: 0
# The maximum amount of URLs to be processed per task.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package de.mediathekview.mserver.crawler.artem;

/**
 * Constants shared by the classes of the {@code artem} crawler package.
 *
 * <p>This class only holds static values and cannot be instantiated.
 */
public final class ArteMConstants {

  /** Page size appended as the {@code limit} query parameter of {@link #ALL_VIDEOS}. */
  public static final int PAGE_LIMIT = 100;

  /** Scheme and host that {@link #ALL_VIDEOS} is built on. */
  public static final String HOST = "https://api.arte.tv";

  /**
   * Start URL of the video listing: {@code language=de}, {@code sort=-lastModified} and
   * {@code limit=}{@link #PAGE_LIMIT}.
   */
  public static final String ALL_VIDEOS =
      HOST
          + "/api/opa/v3/videos"
          + "?language=de"
          + "&sort=-lastModified"
          + "&limit="
          + PAGE_LIMIT;

  /** Bearer token string that the crawler hands to its tasks as their auth key. */
  public static final String AUTH =
      "Bearer Nzc1Yjc1ZjJkYjk1NWFhN2I2MWEwMmRlMzAzNjI5NmU3NWU3ODg4ODJjOWMxNTMxYzEzZGRjYjg2ZGE4MmIwOA";

  private ArteMConstants() {
    // constants holder, never instantiated
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package de.mediathekview.mserver.crawler.artem;

import de.mediathekview.mlib.daten.Film;
import de.mediathekview.mlib.daten.Sender;
import de.mediathekview.mlib.messages.listener.MessageListener;
import de.mediathekview.mserver.base.config.MServerConfigManager;
import de.mediathekview.mserver.base.messages.ServerMessages;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
import de.mediathekview.mserver.crawler.kika.json.KikaApiFilmDto;
import de.mediathekview.mserver.crawler.kika.tasks.*;
import de.mediathekview.mserver.progress.listeners.SenderProgressListener;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.Collection;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveTask;

/**
 * Crawler registered for {@link Sender#ARTE_DE}.
 *
 * <p>It works in two steps: an {@link ArteMVideoTask} collects the video entries starting at
 * {@link ArteMConstants#ALL_VIDEOS}, then an {@link ArteMStreamTask} is returned to turn those
 * entries into films.
 */
public class ArteMCrawler extends AbstractCrawler {
  private static final Logger LOG = LogManager.getLogger(ArteMCrawler.class);

  /**
   * Creates the crawler; all arguments are passed on unchanged to {@link AbstractCrawler}.
   *
   * @param aForkJoinPool pool handed to the base crawler
   * @param aMessageListeners message listeners handed to the base crawler
   * @param aProgressListeners progress listeners handed to the base crawler
   * @param aRootConfig configuration manager handed to the base crawler
   */
  public ArteMCrawler(
      final ForkJoinPool aForkJoinPool,
      final Collection<MessageListener> aMessageListeners,
      final Collection<SenderProgressListener> aProgressListeners,
      final MServerConfigManager aRootConfig) {
    super(aForkJoinPool, aMessageListeners, aProgressListeners, aRootConfig);
  }

  /** @return always {@link Sender#ARTE_DE} */
  @Override
  public Sender getSender() {
    return Sender.ARTE_DE;
  }

  /**
   * Collects the video entries and wraps them into the task that creates the films.
   *
   * @return an {@link ArteMStreamTask} over the collected videos, or {@code null} when any
   *     exception occurred while collecting them (the exception is logged)
   */
  @Override
  protected RecursiveTask<Set<Film>> createCrawlerTask() {
    try {
      // Step 1: the video listing is the single start URL.
      final Queue<CrawlerUrlDTO> startUrls = new ConcurrentLinkedQueue<>();
      startUrls.add(new CrawlerUrlDTO(ArteMConstants.ALL_VIDEOS));

      // Fork the listing task and wait for its result right away.
      final ArteMVideoTask listingTask =
          new ArteMVideoTask(this, startUrls, ArteMConstants.AUTH, 0);
      final Queue<ArteMVideoDto> foundVideos =
          new ConcurrentLinkedQueue<>(listingTask.fork().join());

      // Report how many entries were found and use that as the expected maximum count.
      printMessage(
          ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), foundVideos.size());
      getAndSetMaxCount(foundVideos.size());

      // Step 2: the returned task resolves the streams of every video.
      return new ArteMStreamTask(this, foundVideos, ArteMConstants.AUTH, 0);
    } catch (final Exception ex) {
      LOG.fatal("Exception in ARTE_DE crawler.", ex);
      return null;
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package de.mediathekview.mserver.crawler.artem;

import com.google.gson.*;

import de.mediathekview.mserver.base.utils.JsonUtils;
import de.mediathekview.mserver.crawler.basic.PagedElementListDTO;
import java.lang.reflect.Type;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;


/**
 * Gson deserializer that reads a stream response into a {@link PagedElementListDTO} of
 * {@link ArteMStreamDto}.
 *
 * <p>The next-page link is read from {@code meta.videoStreams.links.next.href}, the streams from
 * the {@code videoStreams} array and the subtitles from the {@code subtitles} array. All streams
 * of one response share the same subtitle map.
 */
public class ArteMSreamDeserializer implements JsonDeserializer<PagedElementListDTO<ArteMStreamDto>> {
  private static final String[] NEXT_PAGE = {"meta", "videoStreams", "links", "next", "href"};
  private static final String ELEMENT_STREAMS = "videoStreams";
  private static final String ATTR_LANGUAGE = "language";
  private static final String ATTR_QUALITY = "quality";
  private static final String ATTR_MIMETYPE = "mimeType";
  private static final String ATTR_AUDIOCODE = "audioCode";
  private static final String ATTR_URL = "url";
  private static final String ELEMENT_SUBTITLES = "subtitles";
  private static final String ATTR_SUBTITLES_VERSION = "version";
  private static final String ATTR_SUBTITLES_FILENAME = "filename";

  /**
   * Converts the response document into a paged list of stream entries.
   *
   * @param jsonElement root element of the response
   * @param typeOfT target type requested by Gson (not used)
   * @param context Gson context (not used)
   * @return the list with the next-page link set; it contains no elements when
   *     {@code videoStreams} is missing or is not a JSON array
   * @throws JsonParseException declared by the interface; not thrown directly by this method
   */
  @Override
  public PagedElementListDTO<ArteMStreamDto> deserialize(
      final JsonElement jsonElement, final Type typeOfT, final JsonDeserializationContext context)
      throws JsonParseException {
    final PagedElementListDTO<ArteMStreamDto> list = new PagedElementListDTO<>();
    list.setNextPage(JsonUtils.getElementValueAsString(jsonElement, NEXT_PAGE));

    // Only iterate when the element really is an array; getAsJsonArray() throws otherwise.
    final Optional<JsonElement> videos =
        JsonUtils.getElement(jsonElement, ELEMENT_STREAMS).filter(JsonElement::isJsonArray);
    if (videos.isEmpty()) {
      return list;
    }

    final Optional<Map<String, String>> subtitleStreams = readSubtitles(jsonElement);

    for (final JsonElement stream : videos.get().getAsJsonArray()) {
      list.addElement(
          new ArteMStreamDto(
              JsonUtils.getElementValueAsString(stream, ATTR_LANGUAGE),
              JsonUtils.getElementValueAsString(stream, ATTR_QUALITY),
              JsonUtils.getElementValueAsString(stream, ATTR_MIMETYPE),
              JsonUtils.getElementValueAsString(stream, ATTR_AUDIOCODE),
              JsonUtils.getElementValueAsString(stream, ATTR_URL),
              subtitleStreams));
    }

    return list;
  }

  /**
   * Reads the {@code subtitles} array into a map from subtitle version to file name.
   *
   * <p>Entries lacking either value are skipped, so a single incomplete entry no longer aborts
   * the whole response.
   *
   * @param jsonElement root element of the response
   * @return the (possibly empty) map when a {@code subtitles} array exists, otherwise empty
   */
  private Optional<Map<String, String>> readSubtitles(final JsonElement jsonElement) {
    final Optional<JsonElement> subtitle =
        JsonUtils.getElement(jsonElement, ELEMENT_SUBTITLES).filter(JsonElement::isJsonArray);
    if (subtitle.isEmpty()) {
      return Optional.empty();
    }

    final Map<String, String> subtitleEntries = new HashMap<>();
    for (final JsonElement sub : subtitle.get().getAsJsonArray()) {
      final Optional<String> version =
          JsonUtils.getElementValueAsString(sub, ATTR_SUBTITLES_VERSION);
      final Optional<String> filename =
          JsonUtils.getElementValueAsString(sub, ATTR_SUBTITLES_FILENAME);
      if (version.isPresent() && filename.isPresent()) {
        subtitleEntries.put(version.get(), filename.get());
      }
    }
    return Optional.of(subtitleEntries);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package de.mediathekview.mserver.crawler.artem;

import java.util.Map;
import java.util.Optional;

/**
 * Holder for the values of one video stream entry.
 *
 * <p>Every value is kept exactly as the {@link Optional} handed to the constructor; nothing is
 * copied, validated or unwrapped.
 */
public class ArteMStreamDto {
  Optional<String> language;
  Optional<String> quality;
  Optional<String> mimeType;
  Optional<String> audioCode;
  Optional<String> url;
  Optional<Map<String, String>> subtitles;

  /**
   * Stores the given values as they are.
   *
   * @param language value returned by {@link #getLanguage()}
   * @param quality value returned by {@link #getQuality()}
   * @param mimeType value returned by {@link #getMimeType()}
   * @param audioCode value returned by {@link #getAudioCode()}
   * @param url value returned by {@link #getUrl()}
   * @param subtitles value returned by {@link #getSubtitles()}
   */
  public ArteMStreamDto(
      Optional<String> language,
      Optional<String> quality,
      Optional<String> mimeType,
      Optional<String> audioCode,
      Optional<String> url,
      Optional<Map<String, String>> subtitles) {
    this.language = language;
    this.quality = quality;
    this.mimeType = mimeType;
    this.audioCode = audioCode;
    this.url = url;
    this.subtitles = subtitles;
  }

  /** @return the language value given at construction */
  public Optional<String> getLanguage() {
    return this.language;
  }

  /** @return the quality value given at construction */
  public Optional<String> getQuality() {
    return this.quality;
  }

  /** @return the MIME type value given at construction */
  public Optional<String> getMimeType() {
    return this.mimeType;
  }

  /** @return the audio code value given at construction */
  public Optional<String> getAudioCode() {
    return this.audioCode;
  }

  /** @return the URL value given at construction */
  public Optional<String> getUrl() {
    return this.url;
  }

  /** @return the subtitle map given at construction (same instance, not a copy) */
  public Optional<Map<String, String>> getSubtitles() {
    return this.subtitles;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
package de.mediathekview.mserver.crawler.artem;

import java.lang.reflect.Type;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.time.Duration;
import java.time.LocalDateTime;
import java.time.format.DateTimeParseException;
import java.time.temporal.TemporalUnit;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Queue;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.TimeUnit;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.google.gson.reflect.TypeToken;

import de.mediathekview.mlib.daten.Film;
import de.mediathekview.mlib.daten.GeoLocations;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.AbstractJsonRestTask;
import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.crawler.basic.PagedElementListDTO;
import jakarta.ws.rs.core.Response;

/**
 * Task that requests the stream response for every {@link ArteMVideoDto} and creates one
 * {@link Film} per response.
 *
 * <p>Type arguments passed to {@link AbstractJsonRestTask}: {@link Film} is the element type of
 * this task's result, {@code PagedElementListDTO<ArteMStreamDto>} is the deserialized response
 * and {@link ArteMVideoDto} is the work item that carries the URL to request.
 *
 * <p>NOTE(review): proof of concept — the stream URLs and subtitles of the response are not yet
 * attached to the created film.
 */
public class ArteMStreamTask extends AbstractJsonRestTask<Film, PagedElementListDTO<ArteMStreamDto>, ArteMVideoDto> {
  private static final long serialVersionUID = 1L;
  private static final Logger LOG = LogManager.getLogger(ArteMStreamTask.class);
  /** Platform value whose entries are skipped (marked as "Trailer" in the original code). */
  private static final String PLATFORM_TRAILER = "EXTRAIT";

  /** Number of next-page hops already followed to reach this task. */
  private final int subPageIndex;

  /**
   * @param crawler crawler this task belongs to
   * @param urlToCrawlDTOs videos whose streams are requested
   * @param authKey auth key passed on to the base task
   * @param subPageIndex number of next-page hops already followed; compared with the configured
   *     maximum number of sub pages
   */
  protected ArteMStreamTask(AbstractCrawler crawler, Queue<ArteMVideoDto> urlToCrawlDTOs, String authKey, int subPageIndex) {
    super(crawler, urlToCrawlDTOs, authKey);
    this.subPageIndex = subPageIndex;
  }

  /**
   * @return the type of the deserialized response; it matches both the response type argument of
   *     this class and what {@link ArteMSreamDeserializer} produces
   */
  @Override
  protected Type getType() {
    return new TypeToken<PagedElementListDTO<ArteMStreamDto>>() {}.getType();
  }

  /** Reports the failed request through the crawler and logs the status together with the URL. */
  @Override
  protected void handleHttpError(ArteMVideoDto dto, URI url, Response response) {
    crawler.printErrorMessage();
    LOG.fatal(
        "A HTTP error {} occurred when getting REST information from: \"{}\".",
        response.getStatus(),
        url);
  }

  /**
   * Follows the next-page link (while the sub-page limit allows it), creates the film for the
   * current response and finally merges the results of the forked next-page task.
   *
   * @param aResponseObj deserialized stream response
   * @param aDTO video the response belongs to
   */
  @Override
  protected void postProcessing(PagedElementListDTO<ArteMStreamDto> aResponseObj, ArteMVideoDto aDTO) {
    final Optional<AbstractRecursiveConverterTask<Film, ArteMVideoDto>> subpageCrawler =
        aResponseObj
            .getNextPage()
            .filter(link -> config.getMaximumSubpages() > subPageIndex)
            .map(link -> forkNextPage(link, aDTO));

    if (!PLATFORM_TRAILER.equalsIgnoreCase(aDTO.getPlatform().orElse(""))) {
      final Set<ArteMStreamDto> streams = aResponseObj.getElements();
      if (streams.isEmpty()) {
        // Nothing playable in this response, so no film is created for it.
        LOG.debug("No streams found for \"{}\", film skipped.", aDTO.getTitle().orElse(""));
      } else {
        createFilm(aDTO, streams).ifPresent(taskResults::add);
      }
    }

    // Without this join the films found by the forked next-page task would be lost.
    subpageCrawler.ifPresent(task -> taskResults.addAll(task.join()));
  }

  /** @return a new {@link ArteMSreamDeserializer}; the argument is not used */
  @Override
  protected Object getParser(ArteMVideoDto aDTO) {
    return new ArteMSreamDeserializer();
  }

  /** Creates a task of the same kind for the given elements, one sub-page level deeper. */
  @Override
  protected AbstractRecursiveConverterTask<Film, ArteMVideoDto> createNewOwnInstance(
      Queue<ArteMVideoDto> aElementsToProcess) {
    return new ArteMStreamTask(crawler, aElementsToProcess, getAuthKey().orElse(""), subPageIndex + 1);
  }

  /** Forks a task for a copy of {@code aDTO} whose URL is replaced by the next-page link. */
  private AbstractRecursiveConverterTask<Film, ArteMVideoDto> forkNextPage(
      final String nextPageLink, final ArteMVideoDto aDTO) {
    final ArteMVideoDto nextPage = new ArteMVideoDto(aDTO);
    nextPage.setUrl(nextPageLink);
    final Queue<ArteMVideoDto> nextPageLinks = new ConcurrentLinkedQueue<>();
    nextPageLinks.add(nextPage);
    final AbstractRecursiveConverterTask<Film, ArteMVideoDto> task = createNewOwnInstance(nextPageLinks);
    task.fork();
    return task;
  }

  /**
   * Builds the film from the video data.
   *
   * @param filmData video the film is created from
   * @param streams streams of the video (not yet evaluated, see the class note)
   * @return the film, or empty when the title, a parsable creation date or a parsable duration
   *     is missing
   */
  private Optional<Film> createFilm(final ArteMVideoDto filmData, final Set<ArteMStreamDto> streams) {
    final Optional<String> title = filmData.getTitle();
    final Optional<LocalDateTime> time = filmData.getCreationDate().flatMap(this::parseDate);
    final Optional<Duration> duration = filmData.getDurationSeconds().flatMap(this::parseDuration);
    if (!title.isPresent() || !time.isPresent() || !duration.isPresent()) {
      LOG.warn(
          "Film skipped because of missing data (title present: {}, date parsable: {}, duration parsable: {}).",
          title.isPresent(),
          time.isPresent(),
          duration.isPresent());
      return Optional.empty();
    }

    // Argument order kept from the original: the subtitle is passed before the title.
    final Film film =
        new Film(
            UUID.randomUUID(),
            crawler.getSender(),
            filmData.getSubtitle().orElse(""),
            title.get(),
            time.get(),
            duration.get());
    filmData.getShortDescription().ifPresent(film::setBeschreibung);
    filmData.getWebsite().flatMap(this::parseWebsite).ifPresent(film::setWebsite);
    film.addGeolocation(parseGeo(filmData.getGeoblockingZone().orElse("")));
    return Optional.of(film);
  }

  /**
   * Placeholder for the subtitle conversion, which is not implemented yet.
   *
   * @param data subtitle map of a stream (ignored)
   * @return always an empty, unmodifiable set
   */
  private Set<URL> parseSubtitle(Optional<Map<String, String>> data) {
    return Set.of();
  }

  /**
   * @param data geo blocking zone of the video (currently ignored)
   * @return always {@link GeoLocations#GEO_NONE}; TODO: map the zones other than "ALL"
   */
  private GeoLocations parseGeo(final String data) {
    return GeoLocations.GEO_NONE;
  }

  /**
   * Parses a date with {@link LocalDateTime#parse(CharSequence)}.
   *
   * <p>NOTE(review): this only accepts ISO local date-times without zone or offset — confirm
   * against the format the API really delivers.
   *
   * @return the parsed value, or empty when the text cannot be parsed
   */
  private Optional<LocalDateTime> parseDate(final String date) {
    try {
      return Optional.of(LocalDateTime.parse(date));
    } catch (final DateTimeParseException e) {
      LOG.debug("Unparsable date \"{}\".", date, e);
      return Optional.empty();
    }
  }

  /** @return the duration for the given number of seconds, or empty when it is not a number */
  private Optional<Duration> parseDuration(final String data) {
    try {
      return Optional.of(Duration.ofSeconds(Long.parseLong(data)));
    } catch (final NumberFormatException e) {
      LOG.debug("Unparsable duration \"{}\".", data, e);
      return Optional.empty();
    }
  }

  /** @return the URL for the given text, or empty when it is malformed */
  private Optional<URL> parseWebsite(final String data) {
    try {
      return Optional.of(new URL(data));
    } catch (final MalformedURLException e) {
      LOG.debug("Malformed website url \"{}\".", data, e);
      return Optional.empty();
    }
  }
}
Loading

0 comments on commit 6eba7cb

Please sign in to comment.